BibTeX bibliography tomccap.bib

%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.68",
%%%     date            = "10 April 2024",
%%%     time            = "08:45:51 MST",
%%%     filename        = "tomccap.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "46515 54327 273147 2644697",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "bibliography; BibTeX; ACM Transactions on
%%%                        Multimedia Computing, Communications, and
%%%                        Applications; TOMCCAP; TOMM",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Multimedia Computing,
%%%                        Communications, and Applications (CODEN
%%%                        ????, ISSN 1551-6857), completely covering
%%%                        all issues from volume 1, number 1, February
%%%                        2005 to date.
%%%
%%%                        NB: On 23-May-2014, the journal acronym was
%%%                        changed by ACM from TOMCCAP to TOMM, but the
%%%                        full journal name remains unchanged, and
%%%                        volume / number / pages values are not
%%%                        affected by the change.  The BibTeX journal
%%%                        abbreviation has therefore changed at volume
%%%                        10, number 4, June 2014, from j-TOMCCAP to
%%%                        j-TOMM.  The filename remains tomccap.bib; no
%%%                        tomm.bib exists at the master archive site.
%%%
%%%                        The ACM maintains World Wide Web pages with
%%%                        journal tables of contents for 2005--date at
%%%
%%%                            http://www.acm.org/tomccap/
%%%                            http://www.acm.org/pubs/contents/journals/tomccap/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J961
%%%
%%%                        That data has been automatically converted to
%%%                        BibTeX form, corrected for spelling and page
%%%                        number errors, and merged into this file.
%%%
%%%                        At version 1.68, the COMPLETE year coverage
%%%                        looks like this:
%%%
%%%                             2005 (  20)    2012 (  56)    2019 (  62)
%%%                             2006 (  18)    2013 (  62)    2020 ( 122)
%%%                             2007 (  27)    2014 (  55)    2021 ( 168)
%%%                             2008 (  45)    2015 (  51)    2022 ( 153)
%%%                             2009 (  14)    2016 (  31)    2023 ( 209)
%%%                             2010 (  31)    2017 (  63)    2024 ( 150)
%%%                             2011 (  41)    2018 (  75)
%%%
%%%                             Article:       1453
%%%
%%%                             Total entries: 1453
%%%
%%%                        Spelling has been verified with the UNIX
%%%                        spell and GNU ispell programs using the
%%%                        exception dictionary stored in the companion
%%%                        file with extension .sok.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed by the
%%%                        author for the BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted
%%%                        by journal, and then by publication order,
%%%                        with the help of ``bibsort -byvolume''.  The
%%%                        bibsort utility is available from
%%%                        ftp://ftp.math.utah.edu/pub/tex/bib.
%%%
%%%                        The author will be grateful for reports of
%%%                        errors of any kind in this bibliography.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================

%%% Preamble: three quoted strings joined with the # concatenation
%%% operator into one piece of TeX code: (1) input of bibnames.sty,
%%% (2) a fallback definition of \Thorn as the letter T, used only when
%%% \Thorn is not already defined, and (3) a \hyphenation list, which
%%% is currently empty.
@Preamble{"\input bibnames.sty" #
   "\ifx \Thorn \undefined \def \Thorn {T}\fi" #
   "\hyphenation{
   }"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% ack-nhfb: postal address, telephone, FAX, e-mail addresses, and URL
%%% of Nelson H. F. Beebe.  Entries in this file refer to it through
%%% their acknowledgement field.  The e-mail addresses and the URL are
%%% wrapped in \path|...| and must be kept exactly as written.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% From the ACM Portal Web site: ``On 23rd May 2014, ACM TOMCCAP
%%% changed its acronym to ACM TOMM. This acronym change was the result
%%% of extensive discussions between the journal Editorial Board and
%%% SIGMM constituents dating back to 2011. This name change emphasizes
%%% the continued strong collaboration with the ACM Multimedia
%%% conference (ACMMM).''
%%%
%%% Journal abbreviations:

%%% Full journal name for entries that use the TOMCCAP abbreviation.
@string{j-TOMCCAP = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications}}

%%% Full journal name for entries that use the TOMM abbreviation.
@string{j-TOMM = {ACM Transactions on Multimedia Computing,
                  Communications, and Applications}}

%%% ====================================================================
%%% Bibliography entries sorted in publication order:

@article{Georganas:2005:EBA,
  author          = {Nicolas D. Georganas},
  title           = {{Editorial}: {The} birth of the {ACM Transactions on
                     Multimedia Computing, Communications and Applications}
                     {(TOMCCAP)}},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {1--2},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Rowe:2005:ASR,
  author          = {Lawrence A. Rowe and Ramesh Jain},
  title           = {{ACM SIGMM Retreat} report on future directions in
                     multimedia research},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {3--13},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Jain:2005:GEI,
  author          = {Ramesh Jain and Thomas Plagemann and Ralf Steinmetz},
  title           = {Guest editorial: {The International ACM Multimedia
                     Conference 1993} --- ten years after},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {14--15},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Teodosio:2005:SS,
  author          = {Laura Teodosio and Walter Bender},
  title           = {Salient stills},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {16--36},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Reddy:2005:DSM,
  author          = {A. L. N. Reddy and Jim Wyllie and
                     K. B. R. Wijayaratne},
  title           = {Disk scheduling in a multimedia {I/O} system},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {37--59},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Buchanan:2005:ATL,
  author          = {M. Cecelia Buchanan and Polle T. Zellweger},
  title           = {Automatic temporal layout mechanisms revisited},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {60--88},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Bulterman:2005:SMA,
  author          = {Dick C. A. Bulterman and Lynda Hardman},
  title           = {Structured multimedia authoring},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {89--109},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Mayer-Patel:2005:BSM,
  author          = {Ketan Mayer-Patel and Brian C. Smith and
                     Lawrence A. Rowe},
  title           = {The {Berkeley} software {MPEG-1} video decoder},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = feb,
  volume          = {1},
  number          = {1},
  pages           = {110--125},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Apr 14 11:01:03 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Plagemann:2005:SPA,
  author          = {Thomas Plagemann and Prashant Shenoy and
                     John R. Smith},
  title           = {Selected papers from the {ACM Multimedia Conference
                     2003}},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = may,
  volume          = {1},
  number          = {2},
  pages           = {127--127},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Jul 7 13:52:13 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Kum:2005:RTM,
  author          = {Sang-Uok Kum and Ketan Mayer-Patel},
  title           = {Real-time multidepth stream compression},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = may,
  volume          = {1},
  number          = {2},
  pages           = {128--150},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Jul 7 13:52:13 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

%%% The compound family name of the last author is braced so that
%%% BibTeX keeps "Le Baillif" together as the surname instead of
%%% treating "Le" as one of the given names.
@Article{Feng:2005:PSL,
  author =       "Wu-Chi Feng and Ed Kaiser and Wu Chang Feng and Mikael
                 {Le Baillif}",
  title =        "{Panoptes}: scalable low-power video sensor networking
                 technologies",
  journal =      j-TOMCCAP,
  volume =       "1",
  number =       "2",
  pages =        "151--167",
  month =        may,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Thu Jul 7 13:52:13 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Goh:2005:SFD,
  author          = {Kingshy Goh and Beitao Li and Edward Y. Chang},
  title           = {Semantics and feature discovery via confidence-based
                     ensemble},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = may,
  volume          = {1},
  number          = {2},
  pages           = {168--189},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Jul 7 13:52:13 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Baker:2005:UPC,
  author          = {H. Harlyn Baker and Nina Bhatti and Donald Tanguay
                     and Irwin Sobel and Dan Gelb and Michael E. Goss and
                     W. Bruce Culbertson and Thomas Malzbender},
  title           = {Understanding performance in {Coliseum}, an immersive
                     videoconferencing system},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = may,
  volume          = {1},
  number          = {2},
  pages           = {190--210},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Jul 7 13:52:13 MDT 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Adams:2005:IIM,
  author          = {Brett Adams and Svetha Venkatesh and Ramesh Jain},
  title           = {{IMCE}: {Integrated} media creation environment},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = aug,
  volume          = {1},
  number          = {3},
  pages           = {211--247},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Fri Nov 18 08:30:19 MST 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Poellabauer:2005:FCD,
  author          = {Christian Poellabauer and Karsten Schwan},
  title           = {Flexible cross-domain event delivery for
                     quality-managed multimedia applications},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = aug,
  volume          = {1},
  number          = {3},
  pages           = {248--268},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Fri Nov 18 08:30:19 MST 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Cooper:2005:TEC,
  author          = {Matthew Cooper and Jonathan Foote and
                     Andreas Girgensohn and Lynn Wilcox},
  title           = {Temporal event clustering for digital photo
                     collections},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = aug,
  volume          = {1},
  number          = {3},
  pages           = {269--288},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Fri Nov 18 08:30:19 MST 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Li:2005:CEM,
  author          = {Keqiu Li and Hong Shen},
  title           = {Coordinated enroute multimedia object caching in
                     transcoding proxies for tree networks},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = aug,
  volume          = {1},
  number          = {3},
  pages           = {289--314},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Fri Nov 18 08:30:19 MST 2005},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Wu:2005:AFE,
  author          = {Huahui Wu and Mark Claypool and Robert Kinicki},
  title           = {Adjusting forward error correction with temporal
                     scaling for {TCP}-friendly streaming {MPEG}},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = nov,
  volume          = {1},
  number          = {4},
  pages           = {315--337},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Cai:2005:LUL,
  author          = {Jianfei Cai and Xiangjun Li and Chang Wen Chen},
  title           = {Layered unequal loss protection with pre-interleaving
                     for fast progressive image transmission over
                     packet-loss channels},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = nov,
  volume          = {1},
  number          = {4},
  pages           = {338--353},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Tu:2005:ASP,
  author          = {Yi-Cheng Tu and Jianzhong Sun and Mohamed Hefeeda and
                     Sunil Prabhakar},
  title           = {An analytical study of peer-to-peer media streaming
                     systems},
  journal         = j-TOMCCAP,
  year            = {2005},
  month           = nov,
  volume          = {1},
  number          = {4},
  pages           = {354--376},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Lew:2006:CBM,
  author          = {Michael S. Lew and Nicu Sebe and Chabane Djeraba and
                     Ramesh Jain},
  title           = {Content-based multimedia information retrieval:
                     {State} of the art and challenges},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = feb,
  volume          = {2},
  number          = {1},
  pages           = {1--19},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{DelBimbo:2006:CBR,
  author          = {Alberto {Del Bimbo} and Pietro Pala},
  title           = {Content-based retrieval of {$3$D} models},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = feb,
  volume          = {2},
  number          = {1},
  pages           = {20--43},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Xu:2006:FAF,
  author          = {Huaxin Xu and Tat-Seng Chua},
  title           = {Fusion of {AV} features and external information
                     sources for event detection in team sports video},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = feb,
  volume          = {2},
  number          = {1},
  pages           = {44--67},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Joshi:2006:SPE,
  author          = {Dhiraj Joshi and James Z. Wang and Jia Li},
  title           = {The {Story Picturing Engine}---a system for automatic
                     text illustration},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = feb,
  volume          = {2},
  number          = {1},
  pages           = {68--89},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Snoek:2006:LRS,
  author          = {Cees G. M. Snoek and Marcel Worring and
                     Alexander G. Hauptmann},
  title           = {Learning rich semantics from news video archives by
                     style analysis},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = may,
  volume          = {2},
  number          = {2},
  pages           = {91--108},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Yang:2006:SER,
  author          = {Guang Yang and Tony Sun and Mario Gerla and
                     M. Y. Sanadidi and Ling-Jyh Chen},
  title           = {Smooth and efficient real-time video transport in the
                     presence of wireless errors},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = may,
  volume          = {2},
  number          = {2},
  pages           = {109--126},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Shao:2006:ASM,
  author          = {Xi Shao and Changsheng Xu and Namunu C. Maddage and
                     Qi Tian and Mohan S. Kankanhalli and Jesse S. Jin},
  title           = {Automatic summarization of music videos},
  journal         = j-TOMCCAP,
  year            = {2006},
  month           = may,
  volume          = {2},
  number          = {2},
  pages           = {127--148},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Eide:2006:RTV,
  author          = {Viktor S. Wold Eide and Ole-Christoffer Granmo and Frank Eliassen and J{\o}rgen Andreas Michaelsen},
  title           = {Real-time video content analysis: {QoS}-aware application composition and parallel processing},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {2},
  pages           = {149--172},
  month           = may,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Candan:2006:ISI,
  author          = {K. Sel{\c{c}}uk Candan and Augusto Celentano and Wolfgang Klas},
  title           = {Introduction to special issue on the use of context in multimedia information systems},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {3},
  pages           = {173--176},
  month           = aug,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ferrara:2006:SWO,
  author          = {Alfio Ferrara and Luca A. Ludovico and Stefano Montanelli and Silvana Castano and Goffredo Haus},
  title           = {A {Semantic Web} ontology for context-based classification and retrieval of music resources},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {3},
  pages           = {177--198},
  month           = aug,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Arigon:2006:HMP,
  author          = {Anne-Muriel Arigon and Anne Tchounikine and Maryvonne Miquel},
  title           = {Handling multiple points of view in a multimedia data warehouse},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {3},
  pages           = {199--218},
  month           = aug,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Thu Sep 7 16:13:26 MDT 2006},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

%%% The third author's surname is intercapitalized ("McDaniel"); do not
%%% case-normalize it.
@Article{Kahol:2006:MCH,
  author =       "Kanav Kahol and Priyamvada Tripathi and Troy McDaniel
                 and Laura Bratton and Sethuraman Panchanathan",
  title =        "Modeling context in haptic perception, rendering, and
                 visualization",
  journal =      j-TOMCCAP,
  volume =       "2",
  number =       "3",
  pages =        "219--240",
  month =        aug,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Thu Sep 7 16:13:26 MDT 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gulliver:2006:DUP,
  author          = {Stephen R. Gulliver and Gheorghita Ghinea},
  title           = {Defining user perception of distributed multimedia quality},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {4},
  pages           = {241--257},
  month           = nov,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Sat Apr 14 11:19:17 MDT 2007},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gopalan:2006:SAC,
  author          = {Kartik Gopalan and Lan Huang and Gang Peng and Tzi-Cker Chiueh and Yow-Jian Lin},
  title           = {Statistical admission control using delay distribution measurements},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {4},
  pages           = {258--281},
  month           = nov,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Sat Apr 14 11:19:17 MDT 2007},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Li:2006:MSP,
  author          = {H. Li and M. Li and B. Prabhakaran},
  title           = {Middleware for streaming {$3$D} progressive meshes over lossy networks},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {4},
  pages           = {282--317},
  month           = nov,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Sat Apr 14 11:19:17 MDT 2007},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Etsion:2006:PPU,
  author          = {Yoav Etsion and Dan Tsafrir and Dror G. Feitelson},
  title           = {Process prioritization using output production: {Scheduling} for multimedia},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {4},
  pages           = {318--342},
  month           = nov,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Sat Apr 14 11:19:17 MDT 2007},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cesar:2006:GAH,
  author          = {Pablo Cesar and Petri Vuorimaa and Juha Vierinen},
  title           = {A graphics architecture for high-end interactive television terminals},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {4},
  pages           = {343--357},
  month           = nov,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Sat Apr 14 11:19:17 MDT 2007},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Madhwacharyula:2006:MHV,
  author          = {Chitra L. Madhwacharyula and Marc Davis and Philippe Mulhem and Mohan S. Kankanhalli},
  title           = {Metadata handling: a video perspective},
  journal         = j-TOMCCAP,
  volume          = {2},
  number          = {4},
  pages           = {358--388},
  month           = nov,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate         = {Sat Apr 14 11:19:17 MDT 2007},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

%%% Article-numbered paper (articleno 2): pages use the file's
%%% "articleno:1--articleno:??" form because the final page is unknown.
@Article{Atrey:2007:GOO,
  author =       "Pradeep K. Atrey and Mohan S. Kankanhalli and John B.
                 Oommen",
  title =        "Goal-oriented optimal subset selection of correlated
                 multimedia streams",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% Article-numbered paper (articleno 6): pages use the file's
%%% "articleno:1--articleno:??" form because the final page is unknown.
@Article{Chen:2007:DSI,
  author =       "Datong Chen and Jie Yang and Robert Malkin and Howard
                 D. Wactlar",
  title =        "Detecting social interactions of the elderly in a
                 nursing home environment",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% Article-numbered paper (articleno 4): pages use the file's
%%% "articleno:1--articleno:??" form because the final page is unknown.
@Article{Heck:2007:VV,
  author =       "Rachel Heck and Michael Wallick and Michael Gleicher",
  title =        "Virtual videography",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% Article-numbered paper (articleno 3): pages use the file's
%%% "articleno:1--articleno:??" form because the final page is unknown.
@Article{Truong:2007:VAS,
  author =       "Ba Tu Truong and Svetha Venkatesh",
  title =        "Video abstraction: a systematic review and
                 classification",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% Article-numbered paper (articleno 1): pages use the file's
%%% "articleno:1--articleno:??" form because the final page is unknown.
@Article{Xu:2007:CAD,
  author =       "Changsheng Xu and Namunu C. Maddage and Xi Shao and Qi
                 Tian",
  title =        "Content-adaptive digital music watermarking based on
                 music structure analysis",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% Article-numbered paper (articleno 5): pages use the file's
%%% "articleno:1--articleno:??" form because the final page is unknown.
@Article{Yan:2007:MSO,
  author =       "Wei-Qi Yan and Mohan S. Kankanhalli",
  title =        "Multimedia simplification for optimized {MMS}
                 synthesis",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Apr 14 11:19:17 MDT 2007",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2007:CAT,
  author          = {Tiecheng Liu and John R. Kender},
  title           = {Computational approaches to temporal sampling of video sequences},
  journal         = j-TOMCCAP,
  volume          = {3},
  number          = {2},
  pages           = {7:1--7:??},
  month           = may,
  year            = {2007},
  articleno       = {7},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1230812.1230813},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {key frame selection; temporal video sampling; ubiquitous media access; video content analysis; video summarization},
  abstract        = {Video key frame extraction is one of the most
                 important research problems for video summarization,
                 indexing, and retrieval. For a variety of applications
                 such as ubiquitous media access and video streaming,
                 the temporal boundaries between video key frames are
                 required for synchronizing visual content with audio.
                 In this article, we define temporal video sampling as a
                 unified process of extracting video key frames and
                 computing their temporal boundaries, and formulate it
                 as an optimization problem. We first provide an optimal
                 approach that minimizes temporal video sampling error
                 using a dynamic programming process. The optimal
                 approach retrieves a key frame hierarchy and all
                 temporal boundaries in $ O(n^4) $ time and $ O(n^2) $
                 space. To further reduce computational complexity, we
                 also provide a suboptimal greedy algorithm that
                 exploits the data structure of a binary heap and uses a
                 novel ``look-ahead'' computational technique, enabling
                 all levels of key frames to be extracted with an
                 average-case computational time of $ O(n \log n) $ and
                 memory usage of $ O(n) $. Both the optimal and the
                 greedy methods are free of parameters, thus avoiding
                 the threshold-selection problem that exists in other
                 approaches. We empirically compare the proposed optimal
                 and greedy methods with several existing methods in
                 terms of video sampling error, computational cost, and
                 subjective quality. An evaluation of eight videos of
                 different genres shows that the greedy approach
                 achieves performance very close to that of the optimal
                 approach while drastically reducing computational cost,
                 making it suitable for processing long video sequences
                 in large video databases.},
  bibdate         = {Mon Jun 16 17:10:04 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Moncrieff:2007:OAB,
  author          = {Simon Moncrieff and Svetha Venkatesh and Geoff West},
  title           = {Online audio background determination for complex audio environments},
  journal         = j-TOMCCAP,
  volume          = {3},
  number          = {2},
  pages           = {8:1--8:??},
  month           = may,
  year            = {2007},
  articleno       = {8},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1230812.1230814},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {audio analysis; online background modelling; surveillance and monitoring},
  abstract        = {We present a method for foreground/background
                 separation of audio using a background modelling
                 technique. The technique models the background in an
                 online, unsupervised, and adaptive fashion, and is
                 designed for application to long term surveillance and
                 monitoring problems. The background is determined using
                 a statistical method to model the states of the audio
                 over time. In addition, three methods are used to
                 increase the accuracy of background modelling in
                 complex audio environments. Such environments can cause
                 the failure of the statistical model to accurately
                 capture the background states. An entropy-based
                 approach is used to unify background representations
                 fragmented over multiple states of the statistical
                 model. The approach successfully unifies such
                 background states, resulting in a more robust
                 background model. We adaptively adjust the number of
                 states considered background according to background
                 complexity, resulting in the more accurate
                 classification of background models. Finally, we use an
                 auxiliary model cache to retain potential background
                 states in the system. This prevents the deletion of
                 such states due to a rapid influx of observed states
                 that can occur for highly dynamic sections of the audio
                 signal. The separation algorithm was successfully
                 applied to a number of audio environments representing
                 monitoring applications.},
  bibdate         = {Mon Jun 16 17:10:04 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Oshima:2007:PDS,
  author          = {Chika Oshima and Kazushi Nishimoto and Norihiro Hagita},
  title           = {A piano duo support system for parents to lead children to practice musical performances},
  journal         = j-TOMCCAP,
  volume          = {3},
  number          = {2},
  pages           = {9:1--9:??},
  month           = may,
  year            = {2007},
  articleno       = {9},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1230812.1230815},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {entertainment; musical expression; piano duo; score tracking; support system},
  abstract        = {In this article, we propose ``Family Ensemble,'' a
                 piano duo support system for a musically inept parent
                 and his/her child who is a beginner at playing the
                 piano. The system makes it easier for parents to
                 correctly reproduce a given sequence of pitches along
                 with the child's performance by using score tracking
                 and note-replacement functions. The experiments with
                 this support system showed that the parents can
                 immediately participate in the piano duo. Furthermore,
                 we found that during joint practices using Family
                 Ensemble some subjects discussed musical ideas that
                 they would not have talked about without using the
                 system.},
  bibdate         = {Mon Jun 16 17:10:04 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{He:2007:CSW,
  author          = {Xiaofei He and Deng Cai and Ji-Rong Wen and Wei-Ying Ma and Hong-Jiang Zhang},
  title           = {Clustering and searching {WWW} images using link and page layout analysis},
  journal         = j-TOMCCAP,
  volume          = {3},
  number          = {2},
  pages           = {10:1--10:??},
  month           = may,
  year            = {2007},
  articleno       = {10},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1230812.1230816},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {image clustering; image search; link analysis; Web mining},
  abstract        = {Due to the rapid growth of the number of digital
                 images on the Web, there is an increasing demand for an
                 effective and efficient method for organizing and
                 retrieving the available images. This article describes
                 iFind, a system for clustering and searching WWW
                 images. By using a vision-based page segmentation
                 algorithm, a Web page is partitioned into blocks, and
                 the textual and link information of an image can be
                 accurately extracted from the block containing that
                 image. The textual information is used for image
                 indexing. By extracting the page-to-block,
                 block-to-image, block-to-page relationships through
                 link structure and page layout analysis, we construct
                 an image graph. Our method is less sensitive to noisy
                 links than previous methods like PageRank, HITS, and
                 PicASHOW, and hence the image graph can better reflect
                 the semantic relationship between images. Using the
                 notion of Markov Chain, we can compute the limiting
                 probability distributions of the images, ImageRanks,
                 which characterize the importance of the images. The
                 ImageRanks are combined with the relevance scores to
                 produce the final ranking for image search. With the
                 graph models, we can also use techniques from spectral
                 graph theory for image clustering and embedding, or 2-D
                 visualization. Some experimental results on 11.6
                 million images downloaded from the Web are provided in
                 the article.},
  bibdate         = {Mon Jun 16 17:10:04 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Jung:2007:NBA,
  author          = {Byunghee Jung and Junehwa Song and Yoonjoon Lee},
  title           = {A narrative-based abstraction framework for story-oriented video},
  journal         = j-TOMCCAP,
  volume          = {3},
  number          = {2},
  pages           = {11:1--11:??},
  month           = may,
  year            = {2007},
  articleno       = {11},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1230812.1230817},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {film; narrative structure; online review services; story understanding; story-oriented; video abstraction; video abstraction system},
  abstract        = {This article proposes a novel video abstraction
                 framework for online review services of story-oriented
                 videos such as dramas. Among the many genres of TV
                 programs, a drama is one of the most popularly watched
                 on the Web. The abstracts generated by the proposed
                 framework not only give a summary of a video but also
                 effectively help viewers understand the overall story.
                 In addition, our method is duration-flexible. We get
                 clues about human understanding of a story from
                 scenario writing rules and editorial techniques that
                 are popularly used in the process of video production
                 to explicitly express a narrative, and propose a new
                 video abstraction model, called a Narrative Abstraction
                 Model. The model effectively captures the narrative
                 structure embedded in a story-oriented video and
                 articulates the progress of the story in a weighted
                 directed graph, called a Narrative Structure Graph
                 (NSG). The model provides a basis for a flexible
                 framework for abstract generation using the NSG as the
                 intermediary representation of a video. Different
                 abstracts can be appropriately generated based upon
                 different user requirements. To show the effectiveness
                 of the proposed model and method, we developed a video
                 abstraction system realizing the framework, and
                 successfully applied it to large volumes of TV dramas.
                 The evaluation results show that the proposed framework
                 is a feasible solution for online review services.},
  bibdate         = {Mon Jun 16 17:10:04 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Shacham:2007:UDP,
  author          = {Ron Shacham and Henning Schulzrinne and Srisakul Thakolsri and Wolfgang Kellerer},
  title           = {Ubiquitous device personalization and use: {The} next generation of {IP} multimedia communications},
  journal         = j-TOMCCAP,
  volume          = {3},
  number          = {2},
  pages           = {12:1--12:??},
  month           = may,
  year            = {2007},
  articleno       = {12},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1230812.1230818},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {Internet multimedia; location-based services; mobile communications; ubiquitous computing},
  abstract        = {Service usage in emerging ubiquitous environments
                 includes seamless and personalized usage of public and
                 private devices discovered in the vicinity of a user.
                 In our work, we describe an architecture for device
                 discovery, device configuration, and the transfer of
                 active sessions between devices. The presented
                 architecture uses the Session Initiation Protocol (SIP)
                 as a standardized, widely used signaling protocol for
                 IP-based multimedia services. Our solution includes
                 support of simple existing devices, split of sessions
                 between devices, user-control of location-based
                 behavior, and handling of security and privacy
                 concerns. We present the implementation and show the
                 feasibility of our work with analytical evaluation and
                 measurements.},
  bibdate         = {Mon Jun 16 17:10:04 MDT 2008},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Chen:2007:EMO,
    author          = {Herng-Yow Chen and Sheng-Wei Li},
    title           = {Exploring many-to-one speech-to-text correlation for
                       {Web}-based language learning},
    journal         = j-TOMCCAP,
    volume          = {3},
    number          = {3},
    pages           = {13:1--13:??},
    month           = aug,
    year            = {2007},
    coden           = {????},
    doi             = {https://doi.org/10.1145/1236471.1236472},
    issn            = {1551-6857 (print), 1551-6865 (electronic)},
    bibdate         = {Mon Jun 16 17:10:32 MDT 2008},
    bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                       https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {This article investigates the correlations between
                       multimedia objects (particularly speech and text)
                       involved in language lectures in order to design an
                       effective presentation mechanism for web-based
                       learning. The cross-media correlations are classified
                       into implicit relations (retrieved by computing) and
                       explicit relations (recorded during the preprocessing
                       stage). The implicit temporal correlation between
                       speech and text is primarily to help to negotiate
                       supplementary lecture navigations like tele-pointer
                       movement, lips-sync movement, and content scrolling. We
                       propose a speech-text alignment framework, using an
                       iterative algorithm based on local alignment, to probe
                       many-to-one temporal correlations, and not the
                       one-to-one only. The proposed framework is a more
                       practical method for analyzing general language
                       lectures, and the algorithm's time complexity conforms
                       to the best-possible computation cost, O(nm), without
                       introducing additional computation. In addition, we
                       have shown the feasibility of creating vivid
                       presentations by exploiting implicit relations and
                       artificially simulating some explicit media. To
                       facilitate the navigation of integrated multimedia
                       documents, we develop several visualization techniques
                       for describing media correlations, including guidelines
                       for speech-text correlations, visible-automatic
                       scrolling, and levels of detail of timeline, to provide
                       intuitive and easy-to-use random access mechanisms. We
                       evaluated the performance of the analysis method and
                       human perceptions of the synchronized presentation. The
                       overall performance of the analysis method is that
                       about 99.5\% of the words analyzed are of a temporal
                       error within 0.5 sec and the subjective evaluation
                       result shows that the synchronized presentation is
                       highly acceptable to human beings.},
    acknowledgement = ack-nhfb,
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno       = {13},
    fjournal        = {ACM Transactions on Multimedia Computing,
                       Communications, and Applications},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    keywords        = {analysis and presentation; computed synchronization;
                       cross-media correlation; lips sync; speech-to-text
                       alignment},
}

@article{Wang:2007:EST,
    author          = {Surong Wang and Manoranjan Dash and Liang-Tien Chia
                       and Min Xu},
    title           = {Efficient sampling of training set in large and noisy
                       multimedia data},
    journal         = j-TOMCCAP,
    volume          = {3},
    number          = {3},
    pages           = {14:1--14:??},
    month           = aug,
    year            = {2007},
    coden           = {????},
    doi             = {https://doi.org/10.1145/1236471.1236473},
    issn            = {1551-6857 (print), 1551-6865 (electronic)},
    bibdate         = {Mon Jun 16 17:10:32 MDT 2008},
    bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                       https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {As the amount of multimedia data is increasing
                       day-by-day thanks to less expensive storage devices and
                       increasing numbers of information sources, machine
                       learning algorithms are faced with large-sized and
                       noisy datasets. Fortunately, the use of a good sampling
                       set for training influences the final results
                       significantly. But using a simple random sample (SRS)
                       may not obtain satisfactory results because such a
                       sample may not adequately represent the large and noisy
                       dataset due to its blind approach in selecting samples.
                       The difficulty is particularly apparent for huge
                       datasets where, due to memory constraints, only very
                       small sample sizes are used. This is typically the case
                       for multimedia applications, where data size is usually
                       very large. In this article we propose a new and
                       efficient method to sample of large and noisy
                       multimedia data. The proposed method is based on a
                       simple distance measure that compares the histograms of
                       the sample set and the whole set in order to estimate
                       the representativeness of the sample. The proposed
                       method deals with noise in an elegant manner which SRS
                       and other methods are not able to deal with. We
                       experiment on image and audio datasets. Comparison with
                       SRS and other methods shows that the proposed method is
                       vastly superior in terms of sample representativeness,
                       particularly for small sample sizes although time-wise
                       it is comparable to SRS, the least expensive method in
                       terms of time.},
    acknowledgement = ack-nhfb,
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno       = {14},
    fjournal        = {ACM Transactions on Multimedia Computing,
                       Communications, and Applications},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    keywords        = {audio event identification; histogram; image
                       classification; noise; sampling},
}

@article{Zhou:2007:CCO,
    author          = {Suiping Zhou and Wentong Cai and Stephen J. Turner and
                       Bu-Sung Lee and Junhu Wei},
    title           = {Critical causal order of events in distributed virtual
                       environments},
    journal         = j-TOMCCAP,
    volume          = {3},
    number          = {3},
    pages           = {15:1--15:??},
    month           = aug,
    year            = {2007},
    coden           = {????},
    doi             = {https://doi.org/10.1145/1236471.1236474},
    issn            = {1551-6857 (print), 1551-6865 (electronic)},
    bibdate         = {Mon Jun 16 17:10:32 MDT 2008},
    bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                       https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {We investigate the causal order of events in
                       distributed virtual environments (DVEs). We first
                       define the critical causal order relation among the
                       events. Then, we propose some mechanisms to enhance the
                       prevalent RO (receive order delivery) mechanism in DVEs
                       so that the real-time property of DVEs is preserved
                       while the critical causal order violations are reduced.
                       These mechanisms are implemented as a middleware.
                       Experimental results show that the middleware performs
                       well in reducing the critical causality violations in
                       simulation and incurs little processing overhead.},
    acknowledgement = ack-nhfb,
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno       = {15},
    fjournal        = {ACM Transactions on Multimedia Computing,
                       Communications, and Applications},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    keywords        = {causal order; distributed simulation; virtual
                       environments},
}

@article{Li:2007:SRM,
    author          = {Chuanjun Li and S. Q. Zheng and B. Prabhakaran},
    title           = {Segmentation and recognition of motion streams by
                       similarity search},
    journal         = j-TOMCCAP,
    volume          = {3},
    number          = {3},
    pages           = {16:1--16:??},
    month           = aug,
    year            = {2007},
    coden           = {????},
    doi             = {https://doi.org/10.1145/1236471.1236475},
    issn            = {1551-6857 (print), 1551-6865 (electronic)},
    bibdate         = {Mon Jun 16 17:10:32 MDT 2008},
    bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                       https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {Fast and accurate recognition of motion data streams
                       from gesture sensing and motion capture devices has
                       many applications and is the focus of this article.
                       Based on the analysis of the geometric structures
                       revealed by singular value decompositions (SVD) of
                       motion data, a similarity measure is proposed for
                       simultaneously segmenting and recognizing motion
                       streams. A direction identification approach is
                       explored to further differentiate motions with similar
                       data geometric structures. Experiments show that the
                       proposed similarity measure can segment and recognize
                       motion streams of variable lengths with high accuracy,
                       without knowing beforehand the number of motions in a
                       stream.},
    acknowledgement = ack-nhfb,
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno       = {16},
    fjournal        = {ACM Transactions on Multimedia Computing,
                       Communications, and Applications},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    keywords        = {gesture recognition; motion capture; pattern analysis;
                       principal component analysis; segmentation; similarity
                       measures; singular value decomposition},
}

@article{Ott:2007:OAT,
    author          = {David E. Ott and Ketan Mayer-Patel},
    title           = {An open architecture for transport-level protocol
                       coordination in distributed multimedia applications},
    journal         = j-TOMCCAP,
    volume          = {3},
    number          = {3},
    pages           = {17:1--17:??},
    month           = aug,
    year            = {2007},
    coden           = {????},
    doi             = {https://doi.org/10.1145/1236471.1236476},
    issn            = {1551-6857 (print), 1551-6865 (electronic)},
    bibdate         = {Mon Jun 16 17:10:32 MDT 2008},
    bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                       https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {We consider the problem of flow coordination in
                       distributed multimedia applications. Most
                       transport-level protocols are designed to operate
                       independently and lack mechanisms for sharing
                       information with other flows and coordinating data
                       transport in various ways. This limitation becomes
                       problematic in distributed applications that employ
                       numerous flows between two computing clusters sharing
                       the same intermediary forwarding path across the
                       Internet. In this article, we propose an open
                       architecture that supports the sharing of network state
                       information, peer flow information, and
                       application-specific information. Called simply the
                       coordination protocol (CP), the scheme facilitates
                       coordination of network resource usage across flows
                       belonging to the same application, as well as aiding
                       other types of coordination. The effectiveness of our
                       approach is illustrated in the context of
                       multistreaming in 3D tele-immersion where consistency
                       of network information across flows both greatly
                       improves frame transport synchrony and minimizes
                       buffering delay.},
    acknowledgement = ack-nhfb,
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno       = {17},
    fjournal        = {ACM Transactions on Multimedia Computing,
                       Communications, and Applications},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    keywords        = {distributed applications; flow coordination; network
                       protocols},
}

@article{Sakr:2007:RCB,
    author          = {Ziad Sakr and Nicolas D. Georganas},
    title           = {Robust content-based {MPEG}-4 {XMT} scene structure
                       authentication and multimedia content location},
    journal         = j-TOMCCAP,
    volume          = {3},
    number          = {3},
    pages           = {18:1--18:??},
    month           = aug,
    year            = {2007},
    coden           = {????},
    doi             = {https://doi.org/10.1145/1236471.1236477},
    issn            = {1551-6857 (print), 1551-6865 (electronic)},
    bibdate         = {Mon Jun 16 17:10:32 MDT 2008},
    bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                       https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {For the past decade, there have been numerous research
                       works focusing on the protection of digital images,
                       audio, video, 3D virtual scenes, and software data from
                       unauthorized use and distribution. With the emerging
                       technology of the MPEG-4 standard, MPEG-4 scenes that
                       may include images, video, audio, and 3D objects can
                       easily be built using the text-based MPEG-4 XMT
                       standard. XMT allows content authors to exchange their
                       content with other authors, tools, or service providers
                       and facilitates interoperability with MPEG-4, X3D, and
                       SMIL. In order for owners and designers to protect
                       and/or authenticate their work, some form of security
                       needs to be applied into the MPEG-4 XMT structure and
                       its media content. Unlike images or videos,
                       watermarking an XMT structure is not an easy task,
                       since the structure contains no noise components to
                       embed the watermark. This article is the first one
                       proposing a novel robust algorithm for the
                       authentication of a given MPEG-4 XMT structured scene
                       and the location of its multimedia content.},
    acknowledgement = ack-nhfb,
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno       = {18},
    fjournal        = {ACM Transactions on Multimedia Computing,
                       Communications, and Applications},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    keywords        = {MPEG-4; multimedia; polynomial; pseudorandom
                       sequences; steganography; VRML; watermarking; XML;
                       XMT},
}

@Article{Ghinea:2007:ISI,
  author =       "Gheorghita Ghinea and Chabane Djeraba and Stephen
                 Gulliver and Kara Pernice Coyne",
  title =        "Introduction to special issue on eye-tracking
                 applications in multimedia systems",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "1:1--1:4",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314304",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Colombo:2007:RTR,
  author =       "Carlo Colombo and Dario Comanducci and Alberto {Del
                 Bimbo}",
  title =        "Robust tracking and remapping of eye appearance with
                 passive computer vision",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "2:1--2:20",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314305",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A single-camera iris-tracking and remapping approach
                 based on passive computer vision is presented. Tracking
                 is aimed at obtaining accurate and robust measurements
                 of the iris/pupil position. To this purpose, a robust
                 method for ellipse fitting is used, employing search
                 constraints so as to achieve better performance with
                 respect to the standard RANSAC algorithm. Tracking also
                 embeds an iris localization algorithm (working as a
                 bootstrap multiple-hypotheses generation step), and a
                 blink detector that can detect voluntary eye blinks in
                 human-computer interaction applications. On-screen
                 remapping incorporates a head-tracking method capable
                 of compensating for small user-head movements. The
                 approach operates in real time under different light
                 conditions and in the presence of distractors. An
                 extensive set of experiments is presented and
                 discussed. In particular, an evaluation method for the
                 choice of layout of both hardware components and
                 calibration points is described. Experiments also
                 investigate the importance of providing a visual
                 feedback to the user, and the benefits gained from
                 performing head compensation, especially during
                 image-to-screen map calibration.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "eye blink detection; eye tracking and remapping;
                 eye-driven human-computer interaction; robust fitting",
}

@Article{Wang:2007:UGP,
  author =       "Jun Wang and Lijun Yin and Jason Moore",
  title =        "Using geometric properties of topographic manifold to
                 detect and track eyes for human-computer interaction",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "3:1--3:20",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314306",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Automatic eye detection and tracking is an important
                 component for advanced human-computer interface design.
                 Accurate eye localization can help develop a successful
                 system for face recognition and emotion identification.
                 In this article, we propose a novel approach to detect
                 and track eyes using geometric surface features on
                 topographic manifold of eye images. First, in the joint
                 spatial-intensity domain, a facial image is treated as
                 a 3D terrain surface or image topographic manifold. In
                 particular, eye regions exhibit certain intrinsic
                 geometric traits on this topographic manifold, namely,
                 the pit-labeled center and hillside-like surround
                 regions. Applying a terrain classification procedure on
                 the topographic manifold of facial images, each
                 location of the manifold can be labeled to generate a
                 terrain map. We use the distribution of terrain labels
                 to represent the eye terrain pattern. The Bhattacharyya
                 affinity is employed to measure the distribution
                 similarity between two topographic manifolds. Based on
                 the Bhattacharyya kernel, a support vector machine is
                 applied for selecting proper eye pairs from the
                 pit-labeled candidates. Second, given detected eyes on
                 the first frame of a video sequence, a
                 mutual-information-based fitting function is defined to
                 describe the similarity between two terrain surfaces of
                 neighboring frames. By optimizing the fitting function,
                 eye locations are updated for subsequent frames. The
                 distinction of the proposed approach lies in that both
                 eye detection and eye tracking are performed on the
                 derived topographic manifold, rather than on an
                 original-intensity image domain. The robustness of the
                 approach is demonstrated under various imaging
                 conditions and with different facial appearances, using
                 both static images and video sequences without
                 background constraints.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Bhattacharyya affinity; eye detection; eye tracking;
                 mutual information; topographic manifold",
}

@Article{Agrafiotis:2007:TEC,
  author =       "D. Agrafiotis and S. J. C. Davies and N. Canagarajah
                 and D. R. Bull",
  title =        "Towards efficient context-specific video coding based
                 on gaze-tracking analysis",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "4:1--4:15",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314307",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article discusses a framework for model-based,
                 context-dependent video coding based on exploitation of
                 characteristics of the human visual system. The system
                 utilizes variable-quality coding based on priority maps
                 which are created using mostly context-dependent rules.
                 The technique is demonstrated through two case studies
                 of specific video context, namely open signed content
                 and football sequences. Eye-tracking analysis is
                 employed for identifying the characteristics of each
                 context, which are subsequently exploited for coding
                 purposes, either directly or through a gaze prediction
                 model. The framework is shown to achieve a considerable
                 improvement in coding efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "applications; context-based video coding; eye
                 tracking; multimedia perceptual quality; subjective
                 video quality; transformation of eye movements into
                 useful knowledge",
}

@Article{Urruty:2007:DEF,
  author =       "Thierry Urruty and Stanislas Lew and Nacim Ihadaddene
                 and Dan A. Simovici",
  title =        "Detecting eye fixations by projection clustering",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "5:1--5:20",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314308",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Eye movements are certainly the most natural and
                 repetitive movement of a human being. The most mundane
                 activity, such as watching television or reading a
                 newspaper, involves this automatic activity which
                 consists of shifting our gaze from one point to
                 another.\par

                 Identification of the components of eye movements
                 (fixations and saccades) is an essential part in the
                 analysis of visual behavior because these types of
                 movements provide the basic elements used by further
                 investigations of human vision.\par

                 However, many of the algorithms that detect fixations
                 present a number of problems. In this article, we
                 present a new fixation identification technique that is
                 based on clustering of eye positions, using projections
                 and projection aggregation applied to static pictures.
                 We also present a new method that computes dispersion
                 of eye fixations in videos considering a multiuser
                 environment.\par

                 To demonstrate the performance and usefulness of our
                 approach we discuss our experimental work with two
                 different applications: on fixed image and video.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "eye fixations; interaction modeling; projected
                 clustering; static pictures; videos",
}

@Article{Duchowski:2007:FGC,
  author =       "Andrew T. Duchowski and Arzu {\c{C}}{\"o}ltekin",
  title =        "Foveated gaze-contingent displays for peripheral {LOD}
                 management, {$3$D} visualization, and stereo imaging",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "6:1--6:18",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314309",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Advancements in graphics hardware have allowed
                 development of hardware-accelerated imaging displays.
                 This article reviews techniques for real-time
                 simulation of arbitrary visual fields over still images
                 and video. The goal is to provide the vision sciences
                 and perceptual graphics communities techniques for the
                 investigation of fundamental processes of visual
                 perception. Classic gaze-contingent displays used for
                 these purposes are reviewed and for the first time a
                 pixel shader is introduced for display of a
                 high-resolution window over peripherally degraded
                 stimulus. The pixel shader advances current
                 state-of-the-art by allowing real-time processing of
                 still or streamed images, obviating the need for
                 preprocessing or storage.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "eye tracking; foveation; gaze-contingent displays;
                 level-of-detail",
}

@Article{Loschky:2007:HLC,
  author =       "Lester C. Loschky and Gary S. Wolverton",
  title =        "How late can you update gaze-contingent
                 multiresolutional displays without detection?",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "7:1--7:10",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314310",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This study investigated perceptual disruptions in
                 gaze-contingent multiresolutional displays (GCMRDs) due
                 to delays in updating the center of highest resolution
                 after an eye movement. GCMRDs can be used to save
                 processing resources and transmission bandwidth in many
                 types of single-user display applications, such as
                 virtual reality, video-telephony, simulators, and
                 remote piloting. The current study found that image
                 update delays as late as 60 ms after an eye movement
                 did not significantly increase the detectability of
                 image blur and/or motion transients due to the update.
                 This is good news for designers of GCMRDs, since 60 ms
                 is ample time to update many GCMRDs after an eye
                 movement without disrupting perception. The study also
                 found that longer eye movements led to greater blur
                 and/or transient detection due to moving the eyes
                 further into the low-resolution periphery, effectively
                 reducing the image resolution at fixation prior to the
                 update. In GCMRD applications where longer saccades are
                 more likely (e.g., displays with relatively large
                 distances between objects), this problem could be
                 overcome by increasing the size of the region of
                 highest resolution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "area of interest; bandwidth; blur detection; contrast
                 thresholds; display updates; eye movements; eye
                 tracking; foveated; foveation; gaze-contingent;
                 level-of-detail; multiresolution; perceptual
                 compression; peripheral vision; saccades; saccadic
                 suppression; visual perception",
}

@Article{Murray:2007:AEG,
  author =       "Norman Murray and Dave Roberts and Anthony Steed and
                 Paul Sharkey and Paul Dickerson and John Rae",
  title =        "An assessment of eye-gaze potential within immersive
                 virtual environments",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "8:1--8:17",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314311",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In collaborative situations, eye gaze is a critical
                 element of behavior which supports and fulfills many
                 activities and roles. In current computer-supported
                 collaboration systems, eye gaze is poorly supported.
                 Even in a state-of-the-art video conferencing system
                 such as the access grid, although one can see the face
                 of the user, much of the communicative power of eye
                 gaze is lost. This article gives an overview of some
                 preliminary work that looks towards integrating eye
                 gaze into an immersive collaborative virtual
                 environment and assessing the impact that this would
                 have on interaction between the users of such a system.
                 Three experiments were conducted to assess the efficacy
                 of eye gaze within immersive virtual environments. In
                 each experiment, subjects observed on a large screen
                 the eye-gaze behavior of an avatar. The eye-gaze
                 behavior of that avatar had previously been recorded
                 from a user with the use of a head-mounted eye tracker.
                 The first experiment was conducted to assess the
                 difference between users' abilities to judge what
                 objects an avatar is looking at with only head gaze
                 being viewed and also with eye- and head-gaze data
                 being displayed. The results from the experiment show
                 that eye gaze is of vital importance to the subjects,
                 correctly identifying what a person is looking at in an
                 immersive virtual environment. The second experiment
                 examined whether a monocular or binocular eye-tracker
                 would be required. This was examined by testing
                 subjects' ability to identify where an avatar was
                 looking from their eye direction alone, or by eye
                 direction combined with convergence. This experiment
                 showed that convergence had a significant impact on the
                 subjects' ability to identify where the avatar was
                 looking. The final experiment looked at the effects of
                 stereo and mono-viewing of the scene, with the subjects
                 being asked to identify where the avatar was looking.
                 This experiment showed that there was no difference in
                 the subjects' ability to detect where the avatar was
                 gazing. This is followed by a description of how the
                 eye-tracking system has been integrated into an
                 immersive collaborative virtual environment and some
                 preliminary results from the use of such a system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "eye gaze; immersive virtual environments",
}

@Article{Rachovides:2007:CIM,
  author =       "Dorothy Rachovides and James Walkerdine and Peter
                 Phillips",
  title =        "The conductor interaction method",
  journal =      j-TOMCCAP,
  volume =       "3",
  number =       "4",
  pages =        "9:1--9:23",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1314303.1314312",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:11:20 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Computers have increasingly become part of our
                 everyday lives, with many activities either involving
                 their direct use or being supported by one. This has
                 prompted research into developing methods and
                 mechanisms to assist humans in interacting with
                 computers (human-computer interaction, or HCI). A
                 number of HCI techniques have been developed over the
                 years, some of which are quite old but continue to be
                 used, and some more recent and still evolving. Many of
                 these interaction techniques, however, are not natural
                 in their use and typically require the user to learn a
                 new means of interaction. Inconsistencies within these
                 techniques and the restrictions they impose on user
                 creativity can also make such interaction techniques
                 difficult to use, especially for novice users.\par

                 This article proposes an alternative interaction
                 method, the conductor interaction method (CIM), which
                 aims to provide a more natural and easier-to-learn
                 interaction technique. This novel interaction method
                 extends existing HCI methods by drawing upon techniques
                 found in human-human interaction. It is argued that the
                 use of a two-phased multimodal interaction mechanism,
                 using gaze for selection and gesture for manipulation,
                 incorporated within a metaphor-based environment, can
                 provide a viable alternative for interacting with a
                 computer (especially for novice users). Both the model
                 and an implementation of the CIM within a system are
                 presented in this article. This system formed the basis
                 of a number of user studies that have been performed to
                 assess the effectiveness of the CIM, the findings of
                 which are discussed in this work.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "gaze- and gesture-based interfaces; human-computer
                 interaction",
}

@Article{Luo:2008:IFH,
  author =       "Hangzai Luo and Yuli Gao and Xiangyang Xue and Jinye
                 Peng and Jianping Fan",
  title =        "Incorporating feature hierarchy and boosting to
                 achieve more effective classifier training and
                 concept-oriented video summarization and skimming",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324288",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "For online medical education purposes, we have
                 developed a novel scheme to incorporate the results of
                 semantic video classification to select the most
                 representative video shots for generating
                 concept-oriented summarization and skimming of surgery
                 education videos. First, salient objects are used as
                 the video patterns for feature extraction to achieve a
                 good representation of the intermediate video
                 semantics. The salient objects are defined as the
                 salient video compounds that can be used to
                 characterize the most significant perceptual properties
                 of the corresponding real world physical objects in a
                 video, and thus the appearances of such salient objects
                 can be used to predict the appearances of the relevant
                 semantic video concepts in a specific video domain.
                 Second, a novel multi-modal boosting algorithm is
                 developed to achieve more reliable video classifier
                 training by incorporating feature hierarchy and
                 boosting to dramatically reduce both the training cost
                 and the size of training samples, thus it can
                 significantly speed up SVM (support vector machine)
                 classifier training. In addition, the unlabeled samples
                 are integrated to reduce the human efforts on labeling
                 large amount of training samples. Finally, the results
                 of semantic video classification are incorporated to
                 enable concept-oriented video summarization and
                 skimming. Experimental results in a specific domain of
                 surgery education videos are provided.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "concept-oriented video skimming; feature hierarchy;
                 multi-modal boosting; salient objects; semantic video
                 classification; unlabeled samples",
}

@Article{Hefeeda:2008:RDO,
  author =       "Mohamed Hefeeda and Cheng-Hsin Hsu",
  title =        "Rate-distortion optimized streaming of fine-grained
                 scalable video sequences",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324289",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "We present optimal schemes for allocating bits of
                 fine-grained scalable video sequences among multiple
                 senders streaming to a single receiver. This allocation
                 problem is critical in optimizing the perceived quality
                 in peer-to-peer and distributed multi-server streaming
                 environments. Senders in such environments are
                 heterogeneous in their outgoing bandwidth and they hold
                 different portions of the video stream. We first
                 formulate and optimally solve the problem for
                 individual frames, then we generalize to the multiple
                 frame case. Specifically, we formulate the allocation
                 problem as an optimization problem, which is nonlinear
                 in general. We use rate-distortion models in the
                 formulation to achieve the minimum distortion in the
                 rendered video, constrained by the outgoing bandwidth
                 of senders, availability of video data at senders, and
                 incoming bandwidth of receiver. We show how the adopted
                 rate-distortion models transform the nonlinear problem
                 to an integer linear programming (ILP) problem. We then
                 design a simple rounding scheme that transforms the ILP
                 problem to a linear programming (LP) one, which can be
                 solved efficiently using common optimization techniques
                 such as the Simplex method. We prove that our rounding
                 scheme always produces a feasible solution, and the
                 solution is within a negligible margin from the optimal
                 solution. We also propose a new algorithm (FGSAssign)
                 for the single-frame allocation problem that runs in $
                 O(n \log n) $ steps, where $n$ is the number of senders.
                 We prove that FGSAssign is optimal. Furthermore, we
                 propose a heuristic algorithm (mFGSAssign) that
                 produces near-optimal solutions for the multiple-frame
                 case, and runs an order of magnitude faster than the
                 optimal one. Because of its short running time,
                 mFGSAssign can be used in real time. Our experimental
                 study validates our analytical analysis and shows the
                 effectiveness of our allocation algorithms in improving
                 the video quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "distributed streaming; FGS; fine-grained scalable
                 streaming; peer-to-peer streaming; rate-distortion
                 models; rate-distortion optimized streaming; video
                 streaming",
}

@Article{Babich:2008:VQE,
  author =       "Fulvio Babich and Marco D'Orlando and Francesca
                 Vatta",
  title =        "Video quality estimation in wireless {IP} networks:
                 {Algorithms} and applications",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324290",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article proposes three methods to estimate the
                 distortion deriving from packet losses in wireless
                 video communication. The proposed methods take into
                 account the short-term properties of the encoded video
                 sequences. A suitable set of functions is adopted to
                 model the distortion envelope resulting from multiple
                 losses. The estimated performance is compared with the
                 actual distortion, evaluated by decoding the received
                 sequence with a properly designed decoder. Numerical
                 results confirm the accuracy of the proposed models in
                 approximating the actual Mean Square Error (MSE) for a
                 wide range of loss rates. Some applications of the
                 proposed algorithms are presented.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "distortion estimation; error-concealment;
                 error-resilience; H.264; packet loss rate; real time
                 video; wireless networks",
}

@Article{Kotharu:2008:PQR,
  author =       "Phani S. Kotharu and B. Prabhakaran",
  title =        "Partial query resolution for animation authoring",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324291",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Animations are a part of multimedia and techniques
                 such as motion mapping and inverse kinematics aid in
                 reusing models and motion sequences to create new
                 animations. This reuse approach is facilitated by the
                 use of content-based retrieval techniques that often
                 require fuzzy query resolution. Most fuzzy query
                 resolution approaches work on all the attributes of the
                 query to minimize the database access cost thus
                 resulting in an unsatisfactory result set. It turns out
                 that the query resolution can be carried out in a
                 partial manner to achieve user satisfactory results and
                 aid in easy authoring. In this article, we present two
                 partial fuzzy query resolution approaches, one that
                 results in high-quality animations and the other that
                 produces results with decreasing number of satisfied
                 conditions in the query.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "aggregation function; animation toolkit; fuzzy query;
                 multimedia authoring; partial ordering; top-k query",
}

@Article{Ip:2008:RRS,
  author =       "Alan T. S. Ip and John C. S. Lui and Jiangchuan Liu",
  title =        "A revenue-rewarding scheme of providing incentive for
                 cooperative proxy caching for media streaming systems",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324292",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Network entities cooperating together can improve
                 system performance of media streaming. In this paper,
                 we address the ``incentive issue'' of a cooperative
                 proxy caching system and how to motivate each proxy to
                 provide cache space to the system. To encourage proxies
                 to participate, we propose a ``revenue-rewarding
                 scheme'' to credit the cooperative proxies according to
                 the resources they contribute. A game-theoretic model
                 is used to analyze the interactions among proxies under
                 the revenue-rewarding scheme. We propose two
                 cooperative game settings that lead to optimal
                 situations. In particular, (1) We propose a distributed
                 incentive framework for peers to participate in
                 resource contribution for media streaming; (2) Proxies
                 are encouraged to cooperate under the revenue-rewarding
                 scheme; (3) Profit and social welfare are maximized in
                 these cooperative games; and (4) Cost-effective
                 resource allocation is achieved in these cooperative
                 games. Large scale simulation is carried out to
                 validate and verify the merits of our proposed
                 incentive schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "game-theoretic analysis; incentive mechanism; Nash
                 equilibrium; pricing; resource allocation",
}

@Article{Zhang:2008:AEE,
  author =       "Cha Zhang and Yong Rui and Jim Crawford and Li-Wei
                 He",
  title =        "An automated end-to-end lecture capture and
                 broadcasting system",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324293",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Remote viewing of lectures presented to a live
                 audience is becoming increasingly popular. At the same
                 time, the lectures can be recorded for subsequent
                 on-demand viewing over the Internet. Providing such
                 services, however, is often prohibitive due to the
                 labor-intensive cost of capturing and
                 pre/post-processing. This article presents a complete
                 automated end-to-end system that supports capturing,
                 broadcasting, viewing, archiving and searching of
                 presentations. Specifically, we describe a system
                 architecture that minimizes the pre- and
                 post-production time, and a fully automated lecture
                 capture system called iCam2 that synchronously captures
                 all contents of the lecture, including audio, video,
                 and presentation material. No staff is needed during
                 lecture capture and broadcasting, so the operational
                 cost of the system is negligible. The system has been
                 used on a daily basis for more than 4 years, during
                 which 522 lectures have been captured. These lectures
                 have been viewed over 20,000 times.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "automated lecture capture; lecture broadcasting;
                 live/on-demand broadcasting",
}

@Article{Nguyen:2008:OIV,
  author =       "Giang Phuong Nguyen and Marcel Worring",
  title =        "Optimization of interactive visual-similarity-based
                 search",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324294",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "At one end of the spectrum, research in interactive
                 content-based retrieval concentrates on machine
                 learning methods for effective use of relevance
                 feedback. On the other end, the information
                 visualization community focuses on effective methods
                 for conveying information to the user. What is lacking
                 is research considering the information visualization
                 and interactive retrieval as truly integrated parts of
                 one content-based search system. In such an integrated
                 system, there are many degrees of freedom like the
                 similarity function, the number of images to display,
                 the image size, different visualization modes, and
                 possible feedback modes. To base the optimal values for
                 all of those on user studies is unfeasible. We
                 therefore develop search scenarios in which tasks and
                 user actions are simulated. From there, the proposed
                 scheme is optimized based on objective constraints and
                 evaluation criteria. In such a manner, the degrees of
                 freedom are reduced and the remaining degrees can be
                 evaluated in user studies. In this article, we present
                 a system that integrates advanced similarity based
                 visualization with active learning. We have performed
                 extensive experimentation on interactive category
                 search with different image collections. The results
                 using the proposed simulation scheme show that indeed
                 the use of advanced visualization and active learning
                 pays off in all of these datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "active learning; interactive search; similarity based
                 visualization",
}

@Article{Hlavacs:2008:HVP,
  author =       "Helmut Hlavacs and Shelley Buchinger",
  title =        "Hierarchical video patching with optimal server
                 bandwidth",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324287.1324295",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:06 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Video patching is a way for transporting true
                 video-on-demand, that is, instantaneous without any
                 delay, from a video server to several clients. Instead
                 of sending a unique stream to each newly arriving
                 client, clients share as many multicast transmissions
                 as possible, and are serviced only those parts of the
                 video that they have missed.\par

                 We present a novel video patching scheme using
                 hierarchies of patches. Our scheme minimizes the
                 bandwidth needed by the video server, and may result in
                 the fact that clients receive several streams in
                 parallel. We show analytically that for Poisson arrival
                 our algorithm achieves the optimal possible server
                 bandwidth for all schemes where clients share multicast
                 transmissions.\par

                 We also show, how our approach can be combined with
                 batching. This combination requires less server
                 bandwidth than all fixed start point periodic broadcast
                 algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "batching; server bandwidth; true video-on-demand;
                 video patching",
}

@Article{Chen:2008:ASD,
  author =       "Songqing Chen and Shiping Chen and Huiping Guo and Bo
                 Shen and Sushil Jajodia",
  title =        "Achieving simultaneous distribution control and
                 privacy protection for {Internet} media delivery",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352013",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Massive Internet media distribution demands prolonged
                 continuous consumption of networking and disk
                 bandwidths in large capacity. Many proxy-based Internet
                 media distribution algorithms and systems have been
                 proposed, implemented, and evaluated to address the
                 scalability and performance issue. However, few of them
                 have been used in practice, since two important issues
                 are not satisfactorily addressed. First, existing
                 proxy-based media distribution architectures lack an
                 efficient media distribution control mechanism. Without
                 copyright protection, content providers are hesitant to
                 use proxy-based fast distribution techniques. Second,
                 little has been done to protect client privacy during
                 content accesses on the Internet. Straightforward
                 solutions to address these two issues independently
                 lead to conflicts. For example, to enforce distribution
                 control, only legitimate users should be granted access
                 rights. However, this normally discloses more
                 information (such as which object the client is
                 accessing) other than the client identity, which
                 conflicts with the client's desire for privacy
                 protection. In this article, we propose a unified
                 proxy-based media distribution protocol to effectively
                 address these two problems simultaneously. We further
                 design a set of new algorithms in a cooperative proxy
                 environment where our proposed scheme works efficiently
                 and practically. Simulation-based experiments are
                 conducted to extensively evaluate the proposed system.
                 Preliminary results demonstrate the effectiveness of
                 our proposed strategy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "cooperative proxy; distribution control; media
                 delivery; privacy; proxy caching",
}

%%% Li, Bhanu, and Dong: TOMCCAP 4(2), article 10, May 2008.
%%% Topic: CFS-EM, a feature-synthesized EM algorithm for image retrieval.
@Article{Li:2008:FSE,
  author =       "Rui Li and Bir Bhanu and Anlei Dong",
  title =        "Feature synthesized {EM} algorithm for image
                 retrieval",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "10:1--10:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352014",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "As a commonly used unsupervised learning algorithm in
                 Content-Based Image Retrieval (CBIR),
                 Expectation-Maximization (EM) algorithm has several
                 limitations, including the curse of dimensionality and
                 the convergence at a local maximum. In this article, we
                 propose a novel learning approach, namely
                 Coevolutionary Feature Synthesized
                 Expectation-Maximization (CFS-EM), to address the above
                 problems. The CFS-EM is a hybrid of coevolutionary
                 genetic programming (CGP) and EM algorithm applied on
                 partially labeled data. CFS-EM is especially suitable
                 for image retrieval because the images can be searched
                 in the synthesized low-dimensional feature space, while
                 a kernel-based method has to make classification
                 computation in the original high-dimensional space.
                 Experiments on real image databases show that CFS-EM
                 outperforms Radial Basis Function Support Vector
                 Machine (RBF-SVM), CGP, Discriminant-EM (D-EM) and
                 Transductive-SVM (TSVM) in the sense of classification
                 performance and it is computationally more efficient
                 than RBF-SVM in the query phase.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "coevolutionary feature synthesis; content-based image
                 retrieval; expectation maximization; semi-supervised
                 learning",
}

%%% Xu, Xu, Duan, Jin, and Luo: TOMCCAP 4(2), article 11, May 2008.
%%% Topic: audio keywords for semantic event detection in sports video.
@Article{Xu:2008:AKG,
  author =       "Min Xu and Changsheng Xu and Lingyu Duan and Jesse S.
                 Jin and Suhuai Luo",
  title =        "Audio keywords generation for sports video analysis",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "11:1--11:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352015",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Sports video has attracted a global viewership.
                 Research effort in this area has been focused on
                 semantic event detection in sports video to facilitate
                 accessing and browsing. Most of the event detection
                 methods in sports video are based on visual features.
                 However, being a significant component of sports video,
                 audio may also play an important role in semantic event
                 detection. In this paper, we have borrowed the concept
                 of the ``keyword'' from the text mining domain to
                 define a set of specific audio sounds. These specific
                 audio sounds refer to a set of game-specific sounds
                 with strong relationships to the actions of players,
                 referees, commentators, and audience, which are the
                 reference points for interesting sports events. Unlike
                 low-level features, audio keywords can be considered as
                 a mid-level representation, able to facilitate
                 high-level analysis from the semantic concept point of
                 view. Audio keywords are created from low-level audio
                 features with learning by support vector machines. With
                 the help of video shots, the created audio keywords can
                 be used to detect semantic events in sports video by
                 Hidden Markov Model (HMM) learning. Experiments on
                 creating audio keywords and, subsequently, event
                 detection based on audio keywords have been very
                 encouraging. Based on the experimental results, we
                 believe that the audio keyword is an effective
                 representation that is able to achieve satisfying
                 results for event detection in sports video.
                 Application in three sports types demonstrates the
                 practicality of the proposed method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "audio keywords; event detection; semantics analysis;
                 sports video analysis; support vector machines",
}

%%% Tullimas, Nguyen, Edgecomb, and Cheung: TOMCCAP 4(2), article 12,
%%% May 2008.  Topic: MultiTCP, streaming over multiple TCP connections.
@Article{Tullimas:2008:MSU,
  author =       "Sunand Tullimas and Thinh Nguyen and Rich Edgecomb and
                 Sen-ching Cheung",
  title =        "Multimedia streaming using multiple {TCP}
                 connections",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "12:1--12:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352016",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In recent years, multimedia applications over the
                 Internet become increasingly popular. However, packet
                 loss, delay, and time-varying bandwidth of the Internet
                 have remained the major problems for multimedia
                 streaming applications. As such, a number of
                 approaches, including network infrastructure and
                 protocol, source and channel coding, have been proposed
                 to either overcome or alleviate these drawbacks of the
                 Internet. In this article, we propose the MultiTCP
                 system, a receiver-driven, TCP-based system for
                 multimedia streaming over the Internet. Our proposed
                 algorithm aims at providing resilience against short
                 term insufficient bandwidth by using multiple TCP
                 connections for the same application. Our proposed
                 system enables the application to achieve and control
                 the desired sending rate during congested periods,
                 which cannot be achieved using traditional TCP.
                 Finally, our proposed system is implemented at the
                 application layer, and hence, no kernel modification to
                 TCP is necessary. We analyze the proposed system, and
                 present simulation and experimental results to
                 demonstrate its advantages over the traditional
                 single-TCP-based approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "multimedia streaming",
}

%%% Tjondronegoro, Chen, and Joly: TOMCCAP 4(2), article 13, May 2008.
%%% Topic: scalable, extensible sports video indexing and retrieval.
@Article{Tjondronegoro:2008:SES,
  author =       "Dian Tjondronegoro and Yi-Ping Phoebe Chen and Adrien
                 Joly",
  title =        "A scalable and extensible segment-event-object-based
                 sports video retrieval system",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352017",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Sport video data is growing rapidly as a result of the
                 maturing digital technologies that support digital
                 video capture, faster data processing, and large
                 storage. However, (1) semi-automatic content extraction
                 and annotation, (2) scalable indexing model, and (3)
                 effective retrieval and browsing, still pose the most
                 challenging problems for maximizing the usage of large
                 video databases. This article will present the findings
                 from a comprehensive work that proposes a scalable and
                 extensible sports video retrieval system with two major
                 contributions in the area of sports video indexing and
                 retrieval. The first contribution is a new sports video
                 indexing model that utilizes semi-schema-based indexing
                 scheme on top of an Object-Relationship approach. This
                 indexing model is scalable and extensible as it enables
                 gradual index construction which is supported by
                 ongoing development of future content extraction
                 algorithms. The second contribution is a set of novel
                 queries which are based on XQuery to generate dynamic
                 and user-oriented summaries and event structures. The
                 proposed sports video retrieval system has been fully
                 implemented and populated with soccer, tennis,
                 swimming, and diving video. The system has been
                 evaluated against 20 users to demonstrate and confirm
                 its feasibility and benefits. The experimental sports
                 genres were specifically selected to represent the four
                 main categories of sports domain: period-, set-point-,
                 time (race)-, and performance-based sports. Thus, the
                 proposed system should be generic and robust for all
                 types of sports.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "automatic content extraction; indexing; mobile video
                 interaction; MPEG-7; sports video retrieval; video
                 database system; XML; XQuery",
}

%%% Zimmermann, Chew, Arslan Ay, and Pawar: TOMCCAP 4(2), article 14,
%%% May 2008.  Topic: data management for distributed immersive
%%% performance (DIP).  The abstract holds two paragraphs joined by \par.
@Article{Zimmermann:2008:DMP,
  author =       "Roger Zimmermann and Elaine Chew and Sakire Arslan Ay
                 and Moses Pawar",
  title =        "Distributed musical performances: {Architecture} and
                 stream management",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352018",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "An increasing number of novel applications produce a
                 rich set of different data types that need to be
                 managed efficiently and coherently. In this article we
                 present our experience with designing and implementing
                 a data management infrastructure for a distributed
                 immersive performance (DIP) application. The DIP
                 project investigates a versatile framework for the
                 capture, recording, and replay of video, audio, and
                 MIDI (Musical Instrument Digital Interface) streams in
                 an interactive environment for collaborative music
                 performance. We are focusing on two classes of data
                 streams that are generated within this environment. The
                 first category consists of high-resolution isochronous
                 media streams, namely audio and video. The second class
                 comprises MIDI data produced by electronic instruments.
                 MIDI event sequences are alphanumeric in nature and
                 fall into the category of the data streams that have
                 been of interest to data management researchers in
                 recent years.\par

                 We present our data management architecture, which
                 provides a repository for all DIP data. Streams of both
                 categories need to be acquired, transmitted, stored,
                 and replayed in real time. Data items are correlated
                 across different streams with temporal indices. The
                 audio and video streams are managed in our own
                 High-performance Data Recording Architecture (HYDRA),
                 which integrates multistream recording and retrieval in
                 a consistent manner. This paper reports on the
                 practical issues and challenges that we encountered
                 during the design, implementation and experimental
                 phases of our prototype. We also present some analysis
                 results and discuss future extensions for the
                 architecture.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "distributed immersive performance; multimedia storage;
                 multimodal data recorder; networked musical
                 performance",
}

%%% Hsu and Hefeeda: TOMCCAP 4(2), article 15, May 2008.
%%% Topic: evaluation of rate-distortion models for FGS video sequences.
@Article{Hsu:2008:ACR,
  author =       "Cheng-Hsin Hsu and Mohamed Hefeeda",
  title =        "On the accuracy and complexity of rate-distortion
                 models for fine-grained scalable video sequences",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352019",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Rate-distortion (R-D) models are functions that
                 describe the relationship between the bitrate and
                 expected level of distortion in the reconstructed video
                 stream. R-D models enable optimization of the received
                 video quality in different network conditions. Several
                 R-D models have been proposed for the increasingly
                 popular fine-grained scalable video sequences. However,
                 the models' relative performance has not been
                 thoroughly analyzed. Moreover, the time complexity of
                 each model is not known, nor is the range of bitrates
                 in which the model produces valid results. This lack of
                 quantitative performance analysis makes it difficult to
                 select the model that best suits a target streaming
                 system. In this article, we classify, analyze, and
                 rigorously evaluate all R-D models proposed for FGS
                 coders in the literature. We classify R-D models into
                 three categories: analytic, empirical, and
                 semi-analytic. We describe the characteristics of each
                 category. We analyze the R-D models by following their
                 mathematical derivations, scrutinizing the assumptions
                 made, and explaining when the assumptions fail and why.
                 In addition, we implement all R-D models, a total of
                 eight, and evaluate them using a diverse set of video
                 sequences. In our evaluation, we consider various
                 source characteristics, diverse channel conditions,
                 different encoding/decoding parameters, different frame
                 types, and several performance metrics including
                 accuracy, range of applicability, and time complexity
                 of each model. We also present clear systematic ways
                 (pseudo codes) for constructing various R-D models from
                 a given video sequence. Based on our experimental
                 results, we present a justified list of recommendations
                 on selecting the best R-D models for video-on-demand,
                 video conferencing, real-time, and peer-to-peer
                 streaming systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "fine-grained scalable coding; multimedia streaming;
                 rate-distortion models",
}

%%% Wang, Kurose, Shenoy, and Towsley: TOMCCAP 4(2), article 16, May 2008.
%%% Topic: analytic performance models of TCP for media streaming.
%%% The subtitle's first word is brace-protected so that its capital
%%% survives style-driven title downcasing.
@Article{Wang:2008:MST,
  author =       "Bing Wang and Jim Kurose and Prashant Shenoy and Don
                 Towsley",
  title =        "Multimedia streaming via {TCP}: {An} analytic
                 performance study",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "2",
  pages =        "16:1--16:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1352012.1352020",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Mon Jun 16 17:12:37 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "TCP is widely used in commercial multimedia streaming
                 systems, with recent measurement studies indicating
                 that a significant fraction of Internet streaming media
                 is currently delivered over HTTP/TCP. These
                 observations motivate us to develop analytic
                 performance models to systematically investigate the
                 performance of TCP for both live and stored-media
                 streaming. We validate our models via ns simulations
                 and experiments conducted over the Internet. Our models
                 provide guidelines indicating the circumstances under
                 which TCP streaming leads to satisfactory performance,
                 showing, for example, that TCP generally provides good
                 streaming performance when the achievable TCP
                 throughput is roughly twice the media bitrate, with
                 only a few seconds of startup delay.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "multimedia streaming; performance modeling",
}

%%% Lin, Wang, and Lin: TOMCCAP 4(3), article 17, August 2008.
%%% Topic: neural-network, link-quality-based handoff algorithm.
@Article{Lin:2008:NNB,
  author =       "Tsungnan Lin and Chiapin Wang and Po-Chiang Lin",
  title =        "A neural-network-based context-aware handoff algorithm
                 for multimedia computing",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1386109.1386110",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:12 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The access of multimedia computing in wireless
                 networks is concerned with the performance of handoff
                 because of the irretrievable property of real-time data
                 delivery. To lessen throughput degradation incurred by
                 unnecessary handoffs or handoff latencies leading to
                 media disruption perceived by users, this paper
                 presents a link quality based handoff algorithm. Neural
                 networks are used to learn the cross-layer correlation
                 between the link quality estimator such as packet
                 success rate and the corresponding context metric
                 indicators, for example, the transmitting packet
                 length, received signal strength, and signal to noise
                 ratio. Based on a pre-processed learning of link
                 quality profile, neural networks make essential handoff
                 decisions efficiently with the evaluations of link
                 quality instead of the comparisons between relative
                 signal strength. The experiment and simulation results
                 show that the proposed algorithm improves the user
                 perceived qualities in a transmission scenario of VoIP
                 applications by minimizing both the number of lost
                 packets and unnecessary handoffs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "context-aware; handoff; multimedia computing; neural
                 networks",
}

%%% Franke, Pannasch, Helmert, Rieger, Groh, and Velichkovsky:
%%% TOMCCAP 4(3), article 18, August 2008.  Topic: aesthetic preference
%%% for single- versus multi-perspective images, with eye tracking.
%%% The subtitle's first word is brace-protected so that its capital
%%% survives style-driven title downcasing.
@Article{Franke:2008:TAC,
  author =       "Ingmar S. Franke and Sebastian Pannasch and Jens R.
                 Helmert and Robert Rieger and Rainer Groh and Boris M.
                 Velichkovsky",
  title =        "Towards attention-centered interfaces: {An} aesthetic
                 evaluation of perspective with eye tracking",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "3",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1386109.1386111",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:12 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The established method of representing
                 three-dimensional space on a two-dimensional surface
                 involves camera based, point of regard systems,
                 comparable in design to the early ``camera obscura''.
                 However, geometrical limitations of such models lead to
                 distortions of perspective when projected. This
                 research investigated the influence of single- versus
                 multi-perspectives on aesthetic choices within one
                 image. A clear perceptual bias towards
                 multi-perspective images was found, additionally
                 supported by an eye tracking study. We propose that
                 human users are more attracted by multi-perspective
                 images, which emphasize the ``semantic foci'' of the
                 scene, than by those being synthesized statically with
                 only one geometrical prospect.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "eye tracking; perspective projection; scene
                 perception; subjective evaluation",
}

%%% Wu, Li, and Zhao: TOMCCAP 4(3), article 19, August 2008.
%%% Topic: Magellan, a trace-based study of live P2P streaming
%%% topologies.  The abstract holds two paragraphs joined by \par.
@Article{Wu:2008:ELS,
  author =       "Chuan Wu and Baochun Li and Shuqiao Zhao",
  title =        "Exploring large-scale peer-to-peer live streaming
                 topologies",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "3",
  pages =        "19:1--19:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1386109.1386112",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:12 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Real-world live peer-to-peer (P2P) streaming
                 applications have been successfully deployed in the
                 Internet, delivering live multimedia content to
                 millions of users at any given time. With relative
                 simplicity in design with respect to peer selection and
                 topology construction protocols and without much
                 algorithmic sophistication, current-generation live P2P
                 streaming applications are able to provide users with
                 adequately satisfying viewing experiences. That said,
                 little existing research has provided sufficient
                 insights on the time-varying internal characteristics
                 of peer-to-peer topologies in live streaming. This
                 article presents {\em Magellan}, our collaborative work
                 with UUSee Inc., Beijing, China, for exploring and
                 charting graph theoretical properties of practical P2P
                 streaming topologies, gaining important insights in
                 their topological dynamics over a long period of
                 time.\par

                 With more than 120 GB worth of traces starting
                 September 2006 from a commercially deployed P2P live
                 streaming system that represents UUSee's core product,
                 we have completed a thorough and in-depth investigation
                 of the topological properties in large-scale live P2P
                 streaming, as well as their evolutionary behavior over
                 time, for example, at different times of the day and in
                 flash crowd scenarios. We seek to explore real-world
                 P2P streaming topologies with respect to their graph
                 theoretical metrics, such as the degree, clustering
                 coefficient, and reciprocity. In addition, we compare
                 our findings with results from existing studies on
                 topological properties of P2P file sharing
                 applications, and present new and unique observations
                 specific to streaming. We have observed that live P2P
                 streaming sessions demonstrate excellent scalability, a
                 high level of reciprocity, a clustering phenomenon in
                 each ISP, and a degree distribution that does {\em
                 not\/} follow the power-law distribution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "peer-to-peer streaming; topology characterization",
}

%%% Goel, Krasic, and Walpole: TOMCCAP 4(3), article 20, August 2008.
%%% Topic: reducing TCP streaming latency by tuning the send buffer.
@Article{Goel:2008:LLA,
  author =       "Ashvin Goel and Charles Krasic and Jonathan Walpole",
  title =        "Low-latency adaptive streaming over {TCP}",
  journal =      j-TOMCCAP,
  volume =       "4",
  number =       "3",
  pages =        "20:1--20:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1386109.1386113",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:12 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Media streaming over TCP has become increasingly
                 popular because TCP's congestion control provides
                 remarkable stability to the Internet. Streaming over
                 TCP requires adapting to bandwidth availability, but
                 unfortunately, TCP can introduce significant latency at
                 the application level, which causes unresponsive and
                 poor adaptation. This article shows that this latency
                 is not inherent in TCP but occurs as a result of
                 throughput-optimized TCP implementations. We show that
                 this latency can be minimized by dynamically tuning
                 TCP's send buffer. Our evaluation shows that this
                 approach leads to better application-level adaptation
                 and it allows supporting interactive and other
                 low-latency applications over TCP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "low latency streaming; multimedia applications; TCP",
}

@article{Lim:2008:DPP,
  author          = {Lim, Seung-Ho and Jeong, Yo-Won and Park, Kyu Ho},
  title           = {Data placement and prefetching with accurate bit rate
                     control for interactive media server},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {3},
  pages           = {21:1--21:??},
  month           = aug,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1386109.1386114},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:12 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {An interactive Media Server should support
                     unrestricted control to viewers with their service
                     level agreements. It is important to manage video data
                     effectively to facilitate efficient retrieval. In this
                     paper, we propose an efficient placement algorithm as
                     part of an effective retrieval scheme to increase the
                     number of clients who can be provided with interactive
                     service. The proposed management schemes are
                     incorporated with a bit count control method that is
                     based on repeated tuning of quantization parameters to
                     adjust the actual bit count to the target bit count.
                     The encoder using this method can generate coded frames
                     whose sizes are synchronized with the RAID stripe size,
                     so that when various fast-forward levels are accessed
                     we can reduce the seek and rotational latency and
                     enhance the disk throughput of each disk in the RAID
                     system. Experimental results demonstrate that the
                     proposed schemes can significantly improve the average
                     service time and guarantee more users service of
                     quality, and the interactive media server can thereby
                     efficiently service a large number of clients.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {bit count control; disk array; Interactive media
                     server; stripe size; video rate},
}

@article{Jie:2008:VGD,
  author          = {Jie, Li and Clark, James J.},
  title           = {Video game design using an eye-movement-dependent
                     model of visual attention},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {3},
  pages           = {22:1--22:??},
  month           = aug,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1386109.1386115},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:12 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Eye movements can be used to infer the allocation of
                     covert attention. In this article, we propose to model
                     the allocation of attention in a task-dependent manner
                     based on different eye movement conditions,
                     specifically fixation and pursuit. We show that the
                     image complexity at eye fixation points during
                     fixation, and the pursuit direction during pursuit are
                     significant factors in attention allocation. Results of
                     the study are applied to the design of an interactive
                     computer game. Real-time eye movement information is
                     taken as one of inputs for the game. The utility of
                     such eye information for controlling game difficulty is
                     shown.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {Entertainment; eye movements; eye tracking; HCI; video
                     games; visual attention},
}

@article{Komogortsev:2008:PRT,
  author          = {Komogortsev, Oleg V. and Khan, Javed I.},
  title           = {Predictive real-time perceptual compression based on
                     eye-gaze-position analysis},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {3},
  pages           = {23:1--23:??},
  month           = aug,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1386109.1386116},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:12 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article designs a real-time perceptual
                     compression system (RTPCS) based on eye-gaze-position
                     analysis. Our results indicate that the
                     eye-gaze-position containment metric provides more
                     efficient and effective evaluation of an RTPCS than the
                     eye fixation containment. The presented RTPCS is
                     designed for a network communication scenario with a
                     feedback loop delay. The proposed RTPCS uses human
                     visual system properties to compensate for the delay
                     and to provide high ratios of multimedia compression.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {23},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {human visual system; Real-time multimedia
                     compression},
}

@article{Cesar:2008:ISI,
  author          = {Cesar, Pablo and Bulterman, Dick C. A. and Soares,
                     Luiz Fernando Gomes},
  title           = {Introduction to special issue: {Human-centered}
                     television --- directions in interactive digital
                     television research},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {4},
  pages           = {24:1--24:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1412196.1412197},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:32 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The research area of interactive digital TV is in the
                     midst of a significant revival. Unlike the first
                     generation of digital TV, which focused on producer
                     concerns that effectively limited (re)distribution, the
                     current generation of research is closely linked to the
                     role of the user in selecting, producing, and
                     distributing content. The research field of interactive
                     digital television is being transformed into a study of
                     human-centered television. Our guest editorial reviews
                     relevant aspects of this transformation in the three
                     main stages of the content lifecycle: content
                     production, content delivery, and content consumption.
                     While past research on content production tools focused
                     on full-fledged authoring tools for professional
                     editors, current research studies lightweight, often
                     informal end-user authoring systems. In terms of
                     content delivery, user-oriented infrastructures such as
                     peer-to-peer are being seen as alternatives to more
                     traditional broadcast solutions. Moreover, end-user
                     interaction is no longer limited to content selection,
                     but now facilitates nonlinear participatory television
                     productions. Finally, user-to-user communication
                     technologies have allowed television to become a
                     central component of an interconnected social
                     experience. The background context given in this
                     article provides a framework for appreciating the
                     significance of four detailed contributions that
                     highlight important directions in transforming
                     interactive television research.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {24},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {Interactive television; shared experiences; standards;
                     survey},
}

@article{Ursu:2008:ITN,
  author          = {Ursu, Marian F. and Thomas, Maureen and Kegel, Ian and
                     Williams, Doug and Tuomola, Mika and Lindstedt, Inger
                     and Wright, Terence and Leurdijk, Andra and Zsombori,
                     Vilmos and Sussner, Julia and Myrestam, Ulf and Hall,
                     Nina},
  title           = {Interactive {TV} narratives: {Opportunities},
                     progress, and challenges},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {4},
  pages           = {25:1--25:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1412196.1412198},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:32 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article is motivated by the question whether
                     television should do more than simply offer interactive
                     services alongside (and separately from) traditional
                     linear programs, in the context of its dominance being
                     seriously challenged and threatened by interactive
                     forms of screen media entertainment. It suggests: yes.
                     Interactive {\em narrativity}, that is, the ability to
                     interact with (and influence) stories whilst they are
                     being told, represents one clear development path for
                     interactive television. The capabilities of computing
                     technology are ripe for exploring this new form of
                     storytelling, from creation to commercial distribution.
                     The article starts by looking at the relationship
                     between narrativity and interactivity in the current
                     context of screen media, and identifies clear signs of
                     interest from certain European public broadcasters in
                     interactive TV narratives. It then presents in detail
                     four recent experimental interactive TV productions in
                     the genres of drama, news, and documentary, developed
                     in collaboration with public broadcasters, which
                     illustrate the potential and richness of this new form
                     of storytelling, but also highlight new technological
                     capabilities necessary for such productions. A number
                     of essential technological requirements are then
                     discussed in more detail in the final part. The article
                     suggests that the ShapeShifting Media Technology,
                     employed in the implementation of the four productions,
                     has made significant advances both at the technological
                     and the creative ends in supporting the development of
                     interactive TV narrativity, but, however, that further
                     developments are required before being able to answer
                     questions such as ``Would end users want such a form of
                     screen media entertainment?'' and ``Would it be
                     effective for both end users and producers?''},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {computational narrativity; digital storytelling;
                     entertainment; Interactive; media; narrativity;
                     nonlinear; screen media; shapeshifting; television},
}

@article{Cheng:2008:GIP,
  author          = {Cheng, Bin and Stein, Lex and Jin, Hai and Liao,
                     Xiaofei and Zhang, Zheng},
  title           = {{GridCast}: {Improving} peer sharing for {P2P VoD}},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {4},
  pages           = {26:1--26:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1412196.1412199},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:32 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Video-on-Demand (VoD) is a compelling application, but
                     costly. VoD is costly due to the load it places on
                     video source servers. Many have proposed using
                     peer-to-peer (P2P) techniques to shift load from
                     servers to peers. Yet, nobody has implemented and
                     deployed a system to openly and systematically evaluate
                     how these techniques work.\par

                     This article describes the design, implementation and
                     evaluation of GridCast, a real deployed P2P VoD system.
                     GridCast has been live on CERNET since May of 2006. It
                     provides seek, pause, and play operations, and employs
                     peer sharing to improve system scalability. In peak
                     months, GridCast has served videos to 23,000 unique
                     users. From the first deployment, we have gathered
                     information to understand the system and evaluate how
                     to further improve peer sharing through caching and
                     replication.\par

                     We first show that GridCast with single video caching
                     (SVC) can decrease load on source servers by an average
                     of 22\% from a client-server architecture. We analyze
                     the net effect on system resources and determine that
                     peer upload is largely idle. This leads us to changing
                     the caching algorithm to cache multiple videos (MVC).
                     MVC decreases source load by an average of 51\% over
                     the client-server. The improvement is greater as user
                     load increases. This bodes well for peer-assistance at
                     larger scales.\par

                     A detailed analysis of MVC shows that departure misses
                     become a major issue in a P2P VoD system with caching
                     optimization. Motivated by this observation, we examine
                     how to use replication to eliminate departure misses
                     and further reduce server load. A framework for lazy
                     replication is presented and evaluated in this article.
                     In this framework, two predictors are plugged in to
                     create the working replication algorithm. With these
                     two simple predictors, lazy replication can decrease
                     server load by 15\% from MVC with only a minor increase
                     in network traffic.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {26},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {caching; peer-to-peer; replication; Video-on-demand},
}

@article{Metcalf:2008:EPL,
  author          = {Metcalf, Crysta and Harboe, Gunnar and Tullio, Joe and
                     Massey, Noel and Romano, Guy and Huang, Elaine M. and
                     Bentley, Frank},
  title           = {Examining presence and lightweight messaging in a
                     social television experience},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {4},
  pages           = {27:1--27:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1412196.1412200},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:32 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {We report on a field evaluation of a prototype social
                     television system (Social TV) that incorporates
                     lightweight messaging as well as ambient awareness of
                     user presence on the system. This evaluation was
                     conducted over a two-week period and involved the
                     participation of ten households. Participants
                     appreciated the ability to see their buddies' presence
                     on the system, the ability to see or suggest the
                     programs they were currently watching, and the ability
                     to send short messages to one another. The presence
                     facilities available in Social TV also allowed
                     participants to learn more about one another's TV
                     viewing habits and preferences, and fostered a sense of
                     connectedness between them. However, they also felt
                     constrained by the limitations of the communication
                     options available to them and demanded free-form text
                     or voice chat to be able to fully express themselves.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {27},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {ambient displays; awareness displays;
                     computer-mediated communication; Social television},
}

@article{Cattelan:2008:WCP,
  author          = {Cattelan, Renan G. and Teixeira, Cesar and Goularte,
                     Rudinei and Pimentel, Maria Da Gra{\c{c}}a C.},
  title           = {Watch-and-comment as a paradigm toward ubiquitous
                     interactive video editing},
  journal         = j-TOMCCAP,
  volume          = {4},
  number          = {4},
  pages           = {28:1--28:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1412196.1412201},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:32 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The literature reports research efforts allowing the
                     editing of interactive TV multimedia documents by
                     end-users. In this article we propose complementary
                     contributions relative to end-user generated
                     interactive video, video tagging, and collaboration. In
                     earlier work we proposed the {\em watch-and-comment\/}
                     (WaC) paradigm as the seamless capture of an
                     individual's comments so that corresponding annotated
                     interactive videos be automatically generated. As a
                     proof of concept, we implemented a prototype
                     application, the WaCTool, that supports the capture of
                     digital ink and voice comments over individual frames
                     and segments of the video, producing a declarative
                     document that specifies both: different media stream
                     structure and synchronization.\par

                     In this article, we extend the WaC paradigm in two
                     ways. First, user-video interactions are associated
                     with edit commands and digital ink operations. Second,
                     focusing on collaboration and distribution issues, we
                     employ annotations as simple containers for context
                     information by using them as tags in order to organize,
                     store and distribute information in a P2P-based
                     multimedia capture platform. We highlight the design
                     principles of the watch-and-comment paradigm, and
                     demonstrate related results including the current
                     version of the WaCTool and its architecture. We also
                     illustrate how an interactive video produced by the
                     WaCTool can be rendered in an interactive video
                     environment, the Ginga-NCL player, and include results
                     from a preliminary evaluation.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {28},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {Annotation; Ginga-NCL; interactive digital video; P2P
                     collaboration},
}

@article{Bailey:2008:SSA,
  author          = {Bailey, Brian P. and Sebe, Nicu and Hanjalic, Alan},
  title           = {Special section from the {ACM Multimedia Conference
                     2007}},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {1},
  pages           = {1:1--1:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1404880.1404881},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:49 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Gleicher:2008:RCI,
  author          = {Gleicher, Michael L. and Liu, Feng},
  title           = {Re-cinematography: {Improving} the camerawork of
                     casual video},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {1},
  pages           = {2:1--2:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1404880.1404882},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:49 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article presents an approach to postprocessing
                     casually captured videos to improve apparent camera
                     movement. {\em Re-cinematography\/} transforms each
                     frame of a video such that the video better follows
                     cinematic conventions. The approach breaks a video into
                     shorter segments. Segments of the source video where
                     there is no intentional camera movement are made to
                     appear as if the camera is completely static. For
                     segments with camera motions, camera paths are
                     keyframed automatically and interpolated with matrix
                     logarithms to give velocity-profiled movements that
                     appear intentional and directed. Closeups are inserted
                     to provide compositional variety in otherwise uniform
                     segments. The approach automatically balances the
                     tradeoff between motion smoothness and distortion to
                     the original imagery. Results from our prototype show
                     improvements to poor quality home videos.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {casual video; cinematography; Image stabilization},
}

@article{Qi:2008:CMV,
  author          = {Qi, Guo-Jun and Hua, Xian-Sheng and Rui, Yong and
                     Tang, Jinhui and Mei, Tao and Wang, Meng and Zhang,
                     Hong-Jiang},
  title           = {Correlative multilabel video annotation with temporal
                     kernels},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {1},
  pages           = {3:1--3:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1404880.1404883},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:49 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Automatic video annotation is an important ingredient
                     for semantic-level video browsing, search and
                     navigation. Much attention has been paid to this topic
                     in recent years. These researches have evolved through
                     two paradigms. In the first paradigm, each concept is
                     individually annotated by a pre-trained binary
                     classifier. However, this method ignores the rich
                     information between the video concepts and only
                     achieves limited success. Evolved from the first
                     paradigm, the methods in the second paradigm add an
                     extra step on the top of the first individual
                     classifiers to fuse the multiple detections of the
                     concepts. However, the performance of these methods can
                     be degraded by the error propagation incurred in the
                     first step to the second fusion one. In this article,
                     another paradigm of the video annotation method is
                     proposed to address these problems. It simultaneously
                     annotates the concepts as well as model correlations
                     between them in one step by the proposed {\em
                     Correlative Multilabel\/} (CML) method, which benefits
                     from the compensation of complementary information
                     between different labels. Furthermore, since the video
                     clips are composed by temporally ordered frame
                     sequences, we extend the proposed method to exploit the
                     rich temporal information in the videos. Specifically,
                     a temporal-kernel is incorporated into the CML method
                     based on the discriminative information between {\em
                     Hidden Markov Models\/} (HMMs) that are learned from
                     the videos. We compare the performance between the
                     proposed approach and the state-of-the-art approaches
                     in the first and second paradigms on the widely used
                     TRECVID data set. As to be shown, superior performance
                     of the proposed method is gained.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {concept correlation; multilabeling; temporal kernel;
                     Video annotation},
}

@article{Chen:2008:DDN,
  author          = {Chen, Yinpeng and Xu, Weiwei and Sundaram, Hari and
                     Rikakis, Thanassis and Liu, Sheng-Min},
  title           = {A dynamic decision network framework for online media
                     adaptation in stroke rehabilitation},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {1},
  pages           = {4:1--4:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1404880.1404884},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  bibdate         = {Tue Mar 16 18:51:49 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this article, we present a media adaptation
                     framework for an immersive biofeedback system for
                     stroke patient rehabilitation. In our biofeedback
                     system, media adaptation refers to changes in
                     audio/visual feedback as well as changes in physical
                     environment. Effective media adaptation frameworks help
                     patients recover generative plans for arm movement with
                     potential for significantly shortened therapeutic time.
                     The media adaptation problem has significant challenges
                     --- (a) high dimensionality of adaptation parameter
                     space; (b) variability in the patient performance
                     across and within sessions; (c) the actual
                     rehabilitation plan is typically a non-first-order
                     Markov process, making the learning task hard.\par

                     Our key insight is to understand media adaptation as a
                     real-time feedback control problem. We use a
                     mixture-of-experts based Dynamic Decision Network (DDN)
                     for online media adaptation. We train DDN mixtures per
                     patient, per session. The mixture models address two
                     basic questions --- (a) given a specific adaptation
                     suggested by the domain experts, predict the patient
                     performance, and (b) given the expected performance,
                     determine the optimal adaptation decision. The
                     questions are answered through an optimality criterion
                     based search on DDN models trained in previous
                     sessions. We have also developed new validation metrics
                     and have very good results for both questions on actual
                     stroke rehabilitation data.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  keywords        = {Biofeedback; dynamic decision network; media
                     adaptation; mixture of experts},
}

@Article{Thouin:2008:EAV,
  author =       "Frederic Thouin and Mark Coates",
  title =        "Equipment allocation in video-on-demand network
                 deployments",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1404880.1404885",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:49 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Video-on-Demand (VoD) services are very user-friendly,
                 but also complex and resource demanding. Deployments
                 involve careful design of many mechanisms where content
                 attributes and usage models should be taken into
                 account. We define, and propose a methodology to solve,
                 the {\em VoD Equipment Allocation Problem\/} of
                 determining the number and type of streaming servers
                 with directly attached storage (VoD servers) to install
                 at each potential location in a metropolitan area
                 network topology such that deployment costs are
                 minimized. We develop a cost model for VoD deployments
                 based on streaming, storage and transport costs and
                 train a parametric function that maps the amount of
                 available storage to a worst-case hit ratio. We observe
                 the impact of having to determine the amount of storage
                 and streaming conjointly, and determine the minimum
                 demand required to deploy replicas as well as the
                 average hit ratio at each location. We observe that
                 common video-on-demand server configurations lead to
                 the installation of excessive storage, because a
                 relatively high hit-ratio can be achieved with small
                 amounts of storage so streaming requirements
                 dominate.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "equipment allocation; optimization; resource
                 allocation; Video-on-demand",
}

@Article{Kolan:2008:NLV,
  author =       "Prakash Kolan and Ram Dantu and Jo{\~a}o W. Cangussu",
  title =        "Nuisance level of a voice call",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1404880.1404886",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:49 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In our everyday life, we communicate with many people
                 such as family, friends, neighbors, and colleagues. We
                 communicate with them using different communication
                 media such as email, telephone calls, and face-to-face
                 interactions. While email is not real-time and
                 face-to-face communications require geographic
                 proximity, voice and video communications are preferred
                 over other modes of communication. However, real-time
                 voice/video calls may create nuisance to the receiver.
                 In this article, we describe a mathematical model for
                 computing nuisance level of incoming voice/video calls.
                 We computed the closeness and nuisance level using the
                 calling patterns between the caller and the callee. To
                 validate the nuisance model, we collected cell phone
                 call records of real-life people at our university and
                 computed the nuisance value for all voice calls. We
                 validated the nuisance levels using the feedback from
                 those real-life people. Such a nuisance model is useful
                 for predicting unwanted voice and video sessions in an
                 IP communication network.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "behavior; Multimedia communications; nuisance;
                 presence; security; tolerance; unwantedness",
}

@Article{Zheng:2008:CVP,
  author =       "Qing-Fang Zheng and Wen Gao",
  title =        "Constructing visual phrases for effective and
                 efficient object-based image retrieval",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "1",
  pages =        "7:1--7:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1404880.1404887",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:49 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The explosion of multimedia data necessitates
                 effective and efficient ways for us to get access to
                 our desired ones. In this article, we draw an analogy
                 between image retrieval and text retrieval and propose
                 a visual phrase-based approach to retrieve images
                 containing desired objects (object-based image
                 retrieval). The visual phrase is defined as a pair of
                 frequently co-occurred adjacent local image patches and
                 is constructed using data mining. We design methods on
                 how to construct visual phrase and how to index/search
                 images based on visual phrase. We demonstrate
                 experiments to show our visual phrase-based approach
                 can be very efficient and more effective than current
                 visual word-based approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Content-based image retrieval; inverted index; local
                 image descriptor; object-based image retrieval; SIFT;
                 visual phrase",
}

@Article{Gill:2008:SDM,
  author =       "Phillipa Gill and Liqi Shi and Anirban Mahanti and
                 Zongpeng Li and Derek L. Eager",
  title =        "Scalable on-demand media streaming for heterogeneous
                 clients",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "1",
  pages =        "8:1--8:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1404880.1404888",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:49 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Periodic broadcast protocols enable efficient
                 streaming of highly popular media files to large
                 numbers of concurrent clients. Most previous periodic
                 broadcast protocols, however, assume that all clients
                 can receive at the same rate, and also assume that
                 reception bandwidth is not time-varying. In this
                 article, we first develop a new periodic broadcast
                 protocol, Optimized Heterogeneous Periodic Broadcast
                 (OHPB), that can be optimized for a given population of
                 clients with heterogeneous reception bandwidths and
                 quality-of-service requirements. The OHPB protocol
                 utilizes an optimized segment size progression
                 determined by solving a linear optimization model that
                 takes as input the client population characteristics
                 and an objective function such as mean client startup
                 delay. We then develop a generalization of the OHPB
                 linear optimization model that allows optimal server
                 bandwidth allocation among multiple concurrent OHPB
                 broadcasts, wherein each media file and its clients may
                 have different characteristics. Finally, we propose
                 complementary client protocols employing work-ahead
                 buffering of data during playback, so as to enable more
                 uniform playback quality when the reception bandwidth
                 is time-varying.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "linear programming; periodic broadcasts;
                 quality-of-service; Scalable streaming",
}

@Article{Jung:2008:SSL,
  author =       "Dawoon Jung and Jaegeuk Kim and Jin-Soo Kim and
                 Joonwon Lee",
  title =        "{ScaleFFS}: a scalable log-structured flash file
                 system for mobile multimedia systems",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "1",
  pages =        "9:1--9:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1404880.1404889",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:51:49 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "NAND flash memory has become one of the most popular
                 storage media for mobile multimedia systems. A key
                 issue in designing storage systems for mobile
                 multimedia systems is handling large-capacity storage
                 media and numerous large files with limited resources
                 such as memory. However, existing flash file systems,
                 including JFFS2 and YAFFS in particular, exhibit many
                 limitations in addressing the storage capacity of
                 mobile multimedia systems.\par

                 In this article, we design and implement a scalable
                 flash file system, called ScaleFFS, for mobile
                 multimedia systems. ScaleFFS is designed to require
                 only a small fixed amount of memory space and to
                 provide fast mount time, even if the file system size
                 grows to more than tens of gigabytes. The measurement
                 results show that ScaleFFS can be instantly mounted
                 regardless of the file system size, while achieving the
                 same write bandwidth and up to 22\% higher read
                 bandwidth compared to JFFS2.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "File system; flash memory; NAND; storage system",
}

@Article{Moncrieff:2008:DPA,
  author =       "Simon Moncrieff and Svetha Venkatesh and Geoff West",
  title =        "Dynamic privacy assessment in a smart house
                 environment using multimodal sensing",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413863",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Surveillance applications in private environments such
                 as smart houses require a privacy management policy if
                 such systems are to be accepted by the occupants of the
                 environment. This is due to the invasive nature of
                 surveillance, and the private nature of the home. In
                 this article, we propose a framework for dynamically
                 altering the privacy policy applied to the monitoring
                 of a smart house based on the situation within the
                 environment. Initially the situation, or context,
                 within the environment is determined; we identify
                 several factors for determining environmental context,
                 and propose methods to quantify the context using audio
                 and binary sensor data. The context is then mapped to
                 an appropriate privacy policy, which is implemented by
                 applying data hiding techniques to control access to
                 data gathered from various information sources. The
                 significance of this work lies in the examination of
                 privacy issues related to assisted-living smart house
                 environments. A single privacy policy in such
                 applications would be either too restrictive for an
                 observer, for example, a carer, or too invasive for the
                 occupants. We address this by proposing a dynamic
                 method, with the aim of decreasing the invasiveness of
                 the technology, while retaining the purpose of the
                 system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Assisted living; audio; context aware; privacy;
                 surveillance and monitoring",
}

@Article{Adams:2008:SUS,
  author =       "Brett Adams and Dinh Phung and Svetha Venkatesh",
  title =        "Sensing and using social context",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "11:1--11:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413864",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "We present online algorithms to extract social
                 context: Social spheres are labeled locations of
                 significance, represented as convex hulls extracted
                 from GPS traces. Colocation is determined from
                 Bluetooth and GPS to extract social rhythms, patterns
                 in time, duration, place, and people corresponding to
                 real-world activities. Social ties are formulated from
                 proximity and shared spheres and rhythms. Quantitative
                 evaluation is performed for 10+ million samples over 45
                 man-months. Applications are presented with assessment
                 of perceived utility: {\em Socio-Graph}, a video and
                 photo browser with filters for social metadata, and
                 {\em Jive}, a blog browser that uses rhythms to
                 discover similarity between entries automatically.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Multimedia browsing; social context",
}

@Article{Mohanty:2008:IWB,
  author =       "Saraju P. Mohanty and Bharat K. Bhargava",
  title =        "Invisible watermarking based on creation and robust
                 insertion-extraction of image adaptive watermarks",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413865",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a novel invisible robust
                 watermarking scheme for embedding and extracting a
                 digital watermark in an image. The novelty lies in
                 determining a perceptually important subimage in the
                 host image. Invisible insertion of the watermark is
                 performed in the most significant region of the host
                 image such that tampering of that portion with an
                 intention to remove or destroy will degrade the
                 esthetic quality and value of the image. One feature of
                 the algorithm is that this subimage is used as a region
                 of interest for the watermarking process and eliminates
                 the chance of watermark removal. Another feature of the
                 algorithm is the creation of a compound watermark using
                 the input user watermark (logo) and attributes of the
                 host image. This facilitates the homogeneous fusion of
                 a watermark with the cover image, preserves the quality
                 of the host image, and allows robust
                 insertion-extraction. Watermark creation consists of
                 two distinct phases. During the first phase, a
                 statistical image is synthesized from a perceptually
                 important subimage of the image. A compound watermark
                 is created by embedding a watermark (logo) into the
                 statistical synthetic image by using a visible
                 watermarking technique. This compound watermark is
                 invisibly embedded into the important block of the host
                 image. The authentication process involves extraction
                 of the perceptive logo as well as statistical testing
                 for two-layer evidence. Results of the experimentation
                 using standard benchmarks demonstrate the robustness
                 and efficacy of the proposed watermarking approach.
                 Ownership proof could be established under various
                 hostile attacks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "content protection; copyright protection; image;
                 invisible watermarking; Watermarking",
}

@Article{Yiu:2008:ODC,
  author =       "Wai-Pun Ken Yiu and Shueng-Han Gary Chan",
  title =        "Offering data confidentiality for multimedia overlay
                 multicast: {Design} and analysis",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413866",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Application layer multicast (ALM) has been proposed to
                 overcome current limitations in IP multicast for
                 large-group multimedia communication. We address
                 offering data confidentiality tailored for ALM. To
                 achieve confidentiality, a node may need to
                 continuously {\em re-encrypt\/} packets before
                 forwarding them downstream. Furthermore, keys have to
                 be changed whenever there is a membership change,
                 leading to {\em rekey\/} processing overhead at the
                 nodes. For a large and dynamic group, these
                 re-encryption and rekeying operations incur high
                 processing overhead at the nodes. We propose and
                 analyze a scalable scheme called Secure Overlay
                 Multicast (SOM) which clusters ALM peers so as to
                 localize rekeying within a cluster and to limit
                 re-encryption at cluster boundaries, thereby minimizing
                 the total nodal processing overhead. We describe the
                 operations of SOM and compare its nodal processing
                 overhead with two other basic approaches, namely,
                 host-to-host encryption and whole group encryption. We
                 also present a simplified analytic model for SOM and
                 show that there exists an optimal cluster size to
                 minimize the total nodal processing overhead. By
                 comparing with a recently proposed ALM scheme (DT
                 protocol), SOM achieves a substantial reduction in
                 nodal processing overhead with similar network
                 performance in terms of network stress and delay.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Key management; multicast security; overlay multicast;
                 performance analysis",
}

@Article{Nakayama:2008:ECR,
  author =       "Minoru Nakayama and Yosiyuki Takahasi",
  title =        "Estimation of certainty for responses to
                 multiple-choice questionnaires using eye movements",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "14:1--14:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413867",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "To examine the feasibility of estimating the degree of
                 strength of belief (SOB) of responses using eye
                 movements, the scan paths of eye movements were
                 analyzed while subjects reviewed their own responses to
                 multiple choice tasks. All fixation points of eye
                 movements were classified into visual areas, or cells,
                 which corresponded with the positions of answers. Two
                 estimation procedures are proposed using eye-movement
                 data. The first one is identifying SOB using scan-path
                 transitions. By comparing subjects' reports of high and
                 low SOB and eye-movement estimations, a significant
                 correct rate of discrimination of SOB was observed.
                 When the threshold of discrimination was controlled, a
                 high rate of correct responses was obtained if it was
                 set at a low level.\par

                 The second procedure is conducting SOB discrimination
                 using support vector machines (SVM) trained with
                 features of fixations. Subjects' gazing features were
                 analyzed while they reviewed their own responses. A
                 discrimination model for SOB was trained with several
                 combinations of features to see whether performance of
                 a significant level could be obtained. As a result, a
                 trained model with 3 features (which consist of
                 interval time, vertical difference, and length between
                 fixations) can provide significant discrimination
                 performance for SOB.\par

                 These results provide evidence that strength of belief
                 can be estimated using eye movements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "certainty; Eye-movements; scan-path analysis; support
                 vector machines",
}

@Article{Shipman:2008:AVG,
  author =       "Frank Shipman and Andreas Girgensohn and Lynn Wilcox",
  title =        "Authoring, viewing, and generating hypervideo: an
                 overview of {Hyper-Hitchcock}",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413868",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Hyper-Hitchcock consists of three components for
                 creating and viewing a form of interactive video called
                 detail-on-demand video: a hypervideo editor, a
                 hypervideo player, and algorithms for automatically
                 generating hypervideo summaries. Detail-on-demand video
                 is a form of hypervideo that supports one hyperlink at
                 a time for navigating between video sequences. The
                 Hyper-Hitchcock editor enables authoring of
                 detail-on-demand video without programming and uses
                 video processing to aid in the authoring process. The
                 Hyper-Hitchcock player uses labels and keyframes to
                 support navigation through and back hyperlinks.
                 Hyper-Hitchcock includes techniques for automatically
                 generating hypervideo summaries of one or more videos
                 that take the form of multiple linear summaries of
                 different lengths with links from the shorter to the
                 longer summaries. User studies on authoring and viewing
                 provided insight into the various roles of links in
                 hypervideo and found that player interface design
                 greatly affects people's understanding of hypervideo
                 structure and the video they access.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Hypervideo; link generation; video editing; video
                 summarization",
}

@Article{He:2008:EED,
  author =       "Wenbo He and Klara Nahrstedt and Xue Liu",
  title =        "End-to-end delay control of multimedia applications
                 over multihop wireless links",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "2",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1413862.1413869",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:17 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The proliferation of multimedia applications over
                 mobile, resource-constrained wireless networks has
                 raised the need for techniques that adapt these
                 applications both to clients' Quality of Service (QoS)
                 requirements and to network resource constraints. This
                 article investigates the upper-layer adaptation
                 mechanisms to achieve end-to-end delay control for
                 multimedia applications. The proposed adaptation
                 approach spans application layer, middleware layer and
                 network layer. In application layer, the requirement
                 adaptor dynamically changes the requirement levels
                 according to end-to-end delay measurement and
                 acceptable QoS requirements for the end-users. In
                 middleware layer, the priority adaptor is used to
                 dynamically adjust the service classes for applications
                 using feedback control theory. In network layer, the
                 service differentiation scheduler assigns different
                 network resources (e.g., bandwidth) to different
                 service classes. With the coordination of these three
                 layers, our approach can adaptively assign resources to
                 multimedia applications. To evaluate the impact of our
                 adaptation scheme, we built a real IEEE 802.11 ad hoc
                 network testbed. The test-bed experiments show that the
                 proposed upper-layer adaptation for end-to-end delay
                 control successfully adjusts multimedia applications to
                 meet delay requirements in many scenarios.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "End-to-end delay QoS; wireless ad hoc networks",
}

@article{Pan:2008:CBM,
  author          = {Leon Pan and Chang N. Zhang},
  title           = {A criterion-based multilayer access control approach
                     for multimedia applications and the implementation
                     considerations},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {2},
  articleno       = {17},
  pages           = {17:1--17:??},
  month           = nov,
  year            = {2008},
  doi             = {https://doi.org/10.1145/1413862.1413870},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:17 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this article, a novel criterion-based multilayer
                     access control (CBMAC) approach is presented to enhance
                     existing access control models such as Role-Based,
                     Mandatory, and Discretionary Access Control models to
                     support multilayer (multilevel) access control. The
                     proposed approach is based on a set of predefined
                     security criteria which are extracted from
                     authorization rules. The security attributes of objects
                     and users are specified by security criterion
                     expressions (serving as locks) and the elements
                     (serving as keys) of security criterion subsets
                     respectively. An object embedded with a number of
                     security criterion expressions becomes a secure object
                     while a user associated with a security criterion
                     subset is called a secure user. The multilayer access
                     control is achieved by evaluating the embedded security
                     criterion expressions (actuating locks) by the elements
                     (keys) in a user's security criterion subset. The paper
                     also provides the details of integrating the proposed
                     approach with existing access control models and
                     presents the implementation considerations of
                     Criterion-Based Role-Based Multilayer Access Control,
                     the integration of CBMAC and Role-Based Access
                     Control.},
  keywords        = {Multilayer access control; secure object; secure
                     permission; secure user; security criterion},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Candan:2009:ISS,
  author          = {K. Sel{\c{c}}uk Candan and Alberto {Del Bimbo} and
                     Carsten Griwodz and Alejandro Jaimes},
  title           = {Introduction to the special section for the best
                     papers of {ACM Multimedia 2008}},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {18},
  pages           = {18:1--18:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556135},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Cesar:2009:FTE,
  author          = {Pablo Cesar and Dick C. A. Bulterman and Jack Jansen
                     and David Geerts and Hendrik Knoche and William
                     Seager},
  title           = {Fragment, tag, enrich, and send: {Enhancing} social
                     sharing of video},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {19},
  pages           = {19:1--19:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556136},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The migration of media consumption to personal
                     computers retains distributed social viewing, but only
                     via nonsocial, strictly personal interfaces. This
                     article presents an architecture, and implementation
                     for media sharing that allows for enhanced social
                     interactions among users. Using a mixed-device model,
                     our work allows targeted, personalized enrichment of
                     content. All recipients see common content, while
                     differentiated content is delivered to individuals via
                     their personal secondary screens. We describe the
                     goals, architecture, and implementation of our system
                     in this article. In order to validate our results, we
                     also present results from two user studies involving
                     disjoint sets of test participants.},
  keywords        = {Asynchronous media sharing; differentiated content
                     enrichment; secondary screens},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Knoche:2009:BPS,
  author =       "Hendrik Knoche and M. A. Sasse",
  title =        "The big picture on small screens delivering acceptable
                 video quality in mobile {TV}",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "3",
  pages =        "20:1--20:??",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1556134.1556137",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:52:39 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Mobile TV viewers can change the viewing distance and
                 (on some devices) scale the picture to their preferred
                 viewing ratio, trading off size for angular resolution.
                 We investigated optimal trade-offs between size and
                 resolution through a series of studies. Participants
                 selected their preferred size and rated the
                 acceptability of the visual experience on a 200ppi
                 device at a 4:3 aspect ratio. They preferred viewing
                 ratios similar to living room TV setups regardless of
                 the much lower resolution: at a minimum 14 pixels per
                 degree. While traveling on trains people required
                 videos with a height larger than 35mm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Mobile multimedia consumption; resolution; size;
                 trade-off",
}

@article{Mondet:2009:CPP,
  author          = {Sebastien Mondet and Wei Cheng and Geraldine Morin and
                     Romulus Grigoras and Frederic Boudon and Wei Tsang
                     Ooi},
  title           = {Compact and progressive plant models for streaming in
                     networked virtual environments},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {21},
  pages           = {21:1--21:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556138},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Just as in the real world, plants are important
                     objects in virtual worlds for creating pleasant and
                     realistic environments, especially those involving
                     natural scenes. As such, much effort has been made in
                     realistic modeling of plants. As the trend moves
                     towards networked and distributed virtual environments,
                     however, the current models are inadequate as they are
                     not designed for progressive transmissions. In this
                     article, we fill in this gap by proposing a progressive
                     representation for plants based on generalized
                     cylinders. We model the shape and thickness of branches
                     in a plant as B{\'e}zier curves, group the curves
                     according to the similarity, and differentially code
                     the curves to represent the plant in a compact and
                     progressive manner. To facilitate the transmission of
                     the plants, we quantify the visual contribution of each
                     branch and use this weight in packet scheduling. We
                     show the efficiency of our representations and the
                     effectiveness of our packet scheduler through
                     experiments over a wide area network.},
  keywords        = {multiresolution; networked virtual environment; plant
                     models; progressive coding; progressive transmission;
                     Streaming},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Wei:2009:CCM,
  author          = {Yong Wei and Suchendra M. Bhandarkar and Kang Li},
  title           = {Client-centered multimedia content adaptation},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {22},
  pages           = {22:1--22:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556139},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The design and implementation of a client-centered
                     multimedia content adaptation system suitable for a
                     mobile environment comprising of resource-constrained
                     handheld devices or clients is described. The primary
                     contributions of this work are: (1) the overall
                     architecture of the client-centered content adaptation
                     system, (2) a data-driven multi-level Hidden Markov
                     model (HMM)-based approach to perform both video
                     segmentation and video indexing in a single pass, and
                     (3) the formulation and implementation of a
                     Multiple-choice Multidimensional Knapsack Problem
                     (MMKP)-based video personalization strategy. In order
                     to segment and index video data, a video stream is
                     modeled at both the semantic unit level and video
                     program level. These models are learned entirely from
                     training data and no domain-dependent knowledge about
                     the structure of video programs is used. This makes the
                     system capable of handling various kinds of videos
                     without having to manually redefine the program model.
                     The proposed MMKP-based personalization strategy is
                     shown to include more relevant video content in
                     response to the client's request than the existing 0/1
                     knapsack problem and fractional knapsack problem-based
                     strategies, and is capable of satisfying multiple
                     client-side constraints simultaneously. Experimental
                     results on CNN news videos and Major League Soccer
                     (MLS) videos are presented and analyzed.},
  keywords        = {hidden Markov models; multiple choice multidimensional
                     knapsack problem; video indexing; Video
                     personalization},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Sivaram:2009:DMS,
  author          = {G. S. V. S. Sivaram and Mohan S. Kankanhalli and K. R.
                     Ramakrishnan},
  title           = {Design of multimedia surveillance systems},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {23},
  pages           = {23:1--23:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556140},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article addresses the problem of how to select
                     the optimal combination of sensors and how to determine
                     their optimal placement in a surveillance region in
                     order to meet the given performance requirements at a
                     minimal cost for a multimedia surveillance system. We
                     propose to solve this problem by obtaining a
                     performance vector, with its elements representing the
                     performances of subtasks, for a given input combination
                     of sensors and their placement. Then we show that the
                     optimal sensor selection problem can be converted into
                     the form of Integer Linear Programming problem (ILP) by
                     using a linear model for computing the optimal
                     performance vector corresponding to a sensor
                     combination. Optimal performance vector corresponding
                     to a sensor combination refers to the performance
                     vector corresponding to the optimal placement of a
                     sensor combination. To demonstrate the utility of our
                     technique, we design and build a surveillance system
                     consisting of PTZ (Pan-Tilt-Zoom) cameras and active
                     motion sensors for capturing faces. Finally, we show
                     experimentally that optimal placement of sensors based
                     on the design maximizes the system performance.},
  keywords        = {Performance vector; sensor selection and placement},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Liu:2009:SSE,
  author          = {Xiaotao Liu and Mark Corner and Prashant Shenoy},
  title           = {{\em {SEVA\/}}: {Sensor-enhanced} video annotation},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {24},
  pages           = {24:1--24:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556141},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this article, we study how a sensor-rich world can
                     be exploited by digital recording devices such as
                     cameras and camcorders to improve a user's ability to
                     search through a large repository of image and video
                     files. We design and implement a digital recording
                     system that records identities and locations of objects
                     (as advertised by their sensors) along with visual
                     images (as recorded by a camera). The process, which we
                     refer to as {\em Sensor-Enhanced Video Annotation
                     (SEVA)}, combines a series of correlation,
                     interpolation, and extrapolation techniques. It
                     produces a tagged stream that later can be used to
                     efficiently search for videos or frames containing
                     particular objects or people. We present detailed
                     experiments with a prototype of our system using both
                     stationary and mobile objects as well as GPS and
                     ultrasound. Our experiments show that: (i) SEVA has
                     zero error rates for static objects, except very close
                     to the boundary of the viewable area; (ii) for moving
                     objects or a moving camera, SEVA only misses objects
                     leaving or entering the viewable area by 1--2 frames;
                     (iii) SEVA can scale to 10 fast-moving objects using
                     current sensor technology; and (iv) SEVA runs online
                     using relatively inexpensive hardware.},
  keywords        = {context-based retrieval; location-based services;
                     sensor-enhanced; Video annotation},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Wang:2009:MLS,
  author          = {Bing Wang and Wei Wei and Zheng Guo and Don Towsley},
  title           = {Multipath live streaming via {TCP}: {Scheme},
                     performance and benefits},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {25},
  pages           = {25:1--25:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556142},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Motivated by the wide use of TCP for multimedia
                     streaming in practice and the increasing availability
                     of multipath between end hosts, we study multipath live
                     streaming via TCP in this article. We first design a
                     simple and practical TCP-based multipath streaming
                     scheme, named {\em Dynamic MPath-streaming
                     (DMP-streaming)}, which dynamically distributes packets
                     over multiple paths by {\em implicitly inferring\/} the
                     available bandwidths on these paths. To allow
                     systematic performance study, we develop an analytical
                     model for DMP-streaming and validate the model using
                     extensive {\em ns\/} simulation and Internet
                     experiments. We explore the parameter space of this
                     model and find that DMP-streaming generally provides
                     satisfactory performance when the aggregate achievable
                     TCP throughput is 1.6 times the video bitrate, when
                     allowing a few seconds of startup delay. Last, we
                     comment on the benefits of using multipath versus
                     single path for TCP-based streaming.},
  keywords        = {multimedia streaming; Performance modeling},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Li:2009:PBR,
  author          = {Mingzhe Li and Mark Claypool and Robert Kinicki},
  title           = {Playout buffer and rate optimization for streaming
                     over {IEEE 802.11} wireless networks},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {3},
  articleno       = {26},
  pages           = {26:1--26:??},
  month           = aug,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1556134.1556143},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:52:39 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Most streaming rate selection and buffer optimization
                     algorithms are developed for wired networks and can
                     perform poorly over wireless networks. Wireless MAC
                     layer behavior, such as rate adaptation,
                     retransmissions, and medium sharing, can significantly
                     degrade the effectiveness of current streaming
                     algorithms. This article presents the Buffer and Rate
                     Optimization for Streaming (BROS) algorithm to improve
                     streaming performance. BROS uses a bandwidth estimation
                     tool designed specifically for wireless networks and
                     models the relationship between buffer size, streaming
                     data rate, and available bandwidth distribution. BROS
                     optimizes the streaming data rate and initial buffer
                     size, resulting in a high data rate but with few frame
                     losses and buffer underflow events, while still keeping
                     a small initial buffer delay. BROS is implemented in
                     the Emulated Streaming (EmuS) client-server system and
                     evaluated on an IEEE 802.11 wireless testbed with
                     various wireless conditions. The evaluation shows that
                     BROS can effectively optimize the streaming rate and
                     initial buffer size based on wireless network bandwidth
                     conditions, thus achieving better performance than
                     static rate or buffer selection and jitter removal
                     buffers.},
  keywords        = {Multimedia networking; playout buffer; streaming rate;
                     wireless networks},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Sauer:2009:MDC,
  author          = {Danielle Sauer and Yee-Hong Yang},
  title           = {Music-driven character animation},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {4},
  articleno       = {27},
  pages           = {27:1--27:??},
  month           = oct,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1596990.1596991},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:53:03 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Music-driven character animation extracts musical
                     features from a song and uses them to create an
                     animation. This article presents a system that builds a
                     new animation directly from musical attributes, rather
                     than simply synchronizing it to the music like similar
                     systems. Using a simple script that identifies the
                     movements involved in the performance and their timing,
                     the user can easily control the animation of
                     characters. Another unique feature of the system is its
                     ability to incorporate multiple characters into the
                     same animation, both with synchronized and
                     unsynchronized movements. A system that integrates
                     Celtic dance movements is developed in this article. An
                     evaluation of the results shows that the majority of
                     animations are found to be appealing to viewers and
                     that altering the music can change the attractiveness
                     of the final result.},
  keywords        = {Character animation; motion synthesis; music analysis;
                     primitive movements},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Deng:2009:SCA,
  author          = {Robert H. Deng and Yanjiang Yang},
  title           = {A study of content authentication in proxy-enabled
                     multimedia delivery systems: {Model}, techniques, and
                     applications},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {4},
  articleno       = {28},
  pages           = {28:1--28:??},
  month           = oct,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1596990.1596992},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:53:03 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Compared with the direct server-user approach, the
                     server-proxy-user architecture for multimedia delivery
                     promises significantly improved system scalability. The
                     introduction of the intermediary transcoding proxies
                     between content servers and end users in this
                     architecture, however, brings unprecedented challenges
                     to content security. In this article, we present a
                     systematic study on the end-to-end content
                     authentication problem in the server-proxy-user
                     context, where intermediary proxies transcode
                     multimedia content dynamically. We present a formal
                     model for the authentication problem, propose a
                     concrete construction for authenticating generic data
                     modality and formally prove its security. We then apply
                     the generic construction to authenticating specific
                     multimedia formats, for example, JPEG2000 code-streams
                     and MPEG-4 video streams. The prototype implementation
                     shows that our scheme is suitable for practical
                     applications.},
  keywords        = {end-to-end authentication; Multimedia content
                     delivery; security},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Cha:2009:TVS,
  author          = {Jongeun Cha and Mohamad Eid and Abdulmotaleb {El
                     Saddik}},
  title           = {Touchable {$3$D} video system},
  journal         = j-TOMCCAP,
  volume          = {5},
  number          = {4},
  articleno       = {29},
  pages           = {29:1--29:??},
  month           = oct,
  year            = {2009},
  doi             = {https://doi.org/10.1145/1596990.1596993},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  coden           = {????},
  bibdate         = {Tue Mar 16 18:53:03 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Multimedia technologies are reaching the limits of
                     providing audio-visual media that viewers consume
                     passively. An important factor, which will ultimately
                     enhance the user's experience in terms of
                     impressiveness and immersion, is interaction. Among
                     daily life interactions, haptic interaction plays a
                     prominent role in enhancing the quality of experience
                     of users, and in promoting physical and emotional
                     development. Therefore, a critical step in multimedia
                     research is expected to bring the sense of touch, or
                     haptics, into multimedia systems and applications. This
                     article proposes a touchable 3D video system where
                     viewers can actively touch a video scene through a
                     force-feedback device, and presents the underlying
                     technologies in three functional components: (1)
                     contents generation, (2) contents transmission, and (3)
                     viewing and interaction. First of all, we introduce a
                     depth image-based haptic representation (DIBHR) method
                     that adds haptic and heightmap images, in addition to
                     the traditional depth image-based representation
                     (DIBR), to encode the haptic surface properties of the
                     video media. In this representation, the haptic image
                     contains the stiffness, static friction, and dynamic
                     friction, whereas the heightmap image contains
                     roughness of the video contents. Based on this
                     representation method, we discuss how to generate
                     synthetic and natural (real) video media through a 3D
                     modeling tool and a depth camera, respectively. Next,
                     we introduce a transmission mechanism based on the
                     MPEG-4 framework where new MPEG-4 BIFS nodes are
                     designed to describe the haptic scene. Finally, a
                     haptic rendering algorithm to compute the interaction
                     force between the scene and the viewer is described. As
                     a result, the performance of the haptic rendering
                     algorithm is evaluated in terms of computational time
                     and smooth contact force. It operates marginally within
                     a 1 kHz update rate that is required to provide stable
                     interaction force and provide smoother contact force
                     with the depth image that has high frequency
                     geometrical noise using a median filter.},
  keywords        = {haptic rendering algorithm; Haptic surface properties;
                     video representation},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Benevenuto:2009:VIO,
  author =       "Fabr{\'\i}cio Benevenuto and Tiago Rodrigues and
                 Virgilio Almeida and Jussara Almeida and Keith Ross",
  title =        "Video interactions in online video social networks",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "4",
  pages =        "30:1--30:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1596990.1596994",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:03 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article characterizes video-based interactions
                 that emerge from YouTube's video response feature,
                 which allows users to discuss themes and to provide
                 reviews for products or places using much richer media
                 than text. Based on crawled data covering a
                 representative subset of videos and users, we present a
                 characterization from two perspectives: the video
                 response view and the interaction network view. In
                 addition to providing valuable statistical models for
                 various characteristics, our study uncovers typical
                 user behavioral patterns in video-based environments
                 and shows evidence of opportunistic behavior.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "opportunistic behavior; promotion; social media;
                 social networks; video communication; Video
                 interactions; video spam; YouTube",
}

@Article{Erdmann:2009:IEB,
  author =       "Maike Erdmann and Kotaro Nakayama and Takahiro Hara
                 and Shojiro Nishio",
  title =        "Improving the extraction of bilingual terminology from
                 {Wikipedia}",
  journal =      j-TOMCCAP,
  volume =       "5",
  number =       "4",
  pages =        "31:1--31:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1596990.1596995",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:03 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Research on the automatic construction of bilingual
                 dictionaries has achieved impressive results. Bilingual
                 dictionaries are usually constructed from parallel
                 corpora, but since these corpora are available only for
                 selected text domains and language pairs, the potential
                 of other resources is being explored as well.\par

                 In this article, we want to further pursue the idea of
                 using Wikipedia as a corpus for bilingual terminology
                 extraction. We propose a method that extracts
                 term-translation pairs from different types of
                 Wikipedia link information. After that, an SVM
                 classifier trained on the features of manually labeled
                 training data determines the correctness of unseen
                 term-translation pairs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Bilingual dictionary; link analysis; Wikipedia
                 mining",
}

@Article{Carlsson:2010:SSL,
  author =       "Niklas Carlsson and Derek L. Eager",
  title =        "Server selection in large-scale video-on-demand
                 systems",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671954.1671955",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:23 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Video on demand, particularly with user-generated
                 content, is emerging as one of the most
                 bandwidth-intensive applications on the Internet. Owing
                 to content control and other issues, some
                 video-on-demand systems attempt to prevent downloading
                 and peer-to-peer content delivery. Instead, such
                 systems rely on server replication, such as via
                 third-party content distribution networks, to support
                 video streaming (or pseudostreaming) to their clients.
                 A major issue with such systems is the cost of the
                 required server resources.\par

                 By synchronizing the video streams for clients that
                 make closely spaced requests for the same video from
                 the same server, server costs (such as for retrieval of
                 the video data from disk) can be amortized over
                 multiple requests. A fundamental trade-off then arises,
                 however, with respect to server selection. Network
                 delivery cost is minimized by selecting the {\em
                 nearest\/} server, while server cost is minimized by
                 directing closely spaced requests for the same video to
                 a {\em common\/} server.\par

                 This article compares classes of server selection
                 policies within the context of a simple system model.
                 We conclude that: (i) server selection using dynamic
                 system state information (rather than only proximities
                 and average loads) can yield large improvements in
                 performance, (ii) deferring server selection for a
                 request as late as possible (i.e., until just before
                 streaming is to begin) can yield additional large
                 improvements, and (iii) within the class of policies
                 using dynamic state information and deferred selection,
                 policies using only ``local'' (rather than global)
                 request information are able to achieve most of the
                 potential performance gains.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "content distribution networks; modeling; Performance
                 analysis; server selection; video-on-demand",
}

@Article{Agarwal:2010:BRW,
  author =       "Parag Agarwal and Balakrishnan Prabhakaran",
  title =        "Blind robust watermarking of {$3$D} motion data",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671954.1671956",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:23 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The article addresses the problem of copyright
                 protection for 3D motion-captured data by designing a
                 robust blind watermarking mechanism. The mechanism
                 segments motion capture data and identifies clusters of
                 3D points per segment. A watermark can be embedded and
                 extracted within these clusters by using a proposed
                 extension of 3D quantization index modulation. The
                 watermarking scheme is blind in nature and the encoded
                 watermarks are shown to be imperceptible, and secure.
                 The resulting hiding capacity has bounds based on
                 cluster size. The watermarks are shown to be robust
                 against attacks such as uniform affine transformations
                 (scaling, rotation, and translation), cropping,
                 reordering, and noise addition. The time complexity for
                 watermark embedding and extraction is estimated as
                 O({\em n\/} log {\em n\/}) and O({\em n\/}$^2$ log {\em
                 n\/}), respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "blind; decoding; encoding; spatial; Watermarking",
}

@Article{Yang:2010:DMD,
  author =       "Bo Yang",
  title =        "{DSI}: a model for distributed multimedia semantic
                 indexing and content integration",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671954.1671957",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:23 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Considerable research has been done on the
                 content-based multimedia delivery and access in
                 distributed data repositories. As noted in the
                 literature, there is always a trade-off between
                 multimedia quality and access speed. In addition, the
                 overall performance is greatly determined by the
                 distribution of the multimedia data. In this article,
                 an unsupervised multimedia semantic integration
                 approach for a distributed infrastructure, the
                 Distributed Semantic Indexing (DSI), is presented that
                 addresses both the data quality and search performance.
                 With the ability of summarizing content information and
                 guiding data distribution, the proposed approach is
                 distinguished by: (1) logic-based representation and
                 concise abstraction of the semantic contents of
                 multimedia data, which are further integrated to form a
                 general overview of a multimedia data repository ---
                 content signature; (2) application of linguistic
                 relationships to construct a hierarchical metadata
                 based on the content signatures allowing imprecise
                 queries; and (3) achieving the optimal performance in
                 terms of search cost. The fundamental structure of the
                 proposed model is presented. The proposed scheme has
                 been simulated and the simulation results are analyzed
                 and compared against several other approaches that have
                 been advocated in the literature.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "distributed indexing; image retrieval; Semantic
                 representation",
}

@Article{Nystrom:2010:ECO,
  author =       "Marcus Nystr{\"o}m and Kenneth Holmqvist",
  title =        "Effect of compressed offline foveated video on viewing
                 behavior and subjective quality",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671954.1671958",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:23 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Offline foveation is a technique to improve the
                 compression efficiency of digitized video. The general
                 idea behind offline foveation is to blur video regions
                 where no or a small number of previewers look without
                 decreasing the subjective quality for later viewers. It
                 relies on the fact that peripheral vision is reduced
                 compared to central vision, and the observation that
                 during free-viewing humans' gaze positions generally
                 coincide when watching video. In this article, we
                 conduct two experiments to assess how offline foveation
                 affects viewing behavior and subjective quality. In the
                 first experiment, 15 subjects free-viewed six video
                 clips before and after offline foveation whereas in the
                 second experiment we had 17 subjects assessing the
                 quality of these videos after one, two, and three
                 consecutive viewings. Eye movements were measured
                 during the experiments. Results showed that, although
                 offline foveation prior to encoding with H.264 yielded
                 data reductions up to 52\% (20\% average) on the tested
                 videos, it had little or no effect on where people
                 looked, their intersubject dispersion, fixation
                 duration, saccade amplitude, or the experienced quality
                 during first-time viewing. However, seeing the videos
                 more than once increased the intersubject dispersion
                 and decreased the subjective quality. In view of these
                 results, we discuss the usage of offline foveated video
                 in practical applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Eye-tracking; foveation; subjective quality; video
                 compression",
}

@Article{Ivanov:2010:RTH,
  author =       "Yuri V. Ivanov and C. J. Bleakley",
  title =        "Real-time {H.264} video encoding in software with fast
                 mode decision and dynamic complexity control",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671954.1671959",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:23 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a novel real-time algorithm for
                 reducing and dynamically controlling the computational
                 complexity of an H.264 video encoder implemented in
                 software. A fast mode decision algorithm, based on a
                 Pareto-optimal macroblock classification scheme, is
                 combined with a dynamic complexity control algorithm
                 that adjusts the MB class decisions such that a
                 constant frame rate is achieved. The average coding
                 efficiency of the proposed algorithm was found to be
                 similar to that of conventional encoding operating at
                 half the frame rate. The proposed algorithm was found
                 to provide lower average bitrate and distortion than
                 static complexity scaling.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "complexity; complexity control; fast mode decision;
                 H.264/AVC; mode decision; rate distortion; real time",
}

@Article{Hefeeda:2010:ASM,
  author =       "Mohamed Hefeeda and Kianoosh Mokhtarian",
  title =        "Authentication schemes for multimedia streams:
                 {Quantitative} analysis and comparison",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671954.1671960",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Mar 16 18:53:23 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the rapid increase in the demand for multimedia
                 services, securing the delivery of multimedia content
                 has become an important issue. Accordingly, the problem
                 of multimedia stream authentication has received
                 considerable attention by previous research and various
                 solutions have been proposed. However, these solutions
                 have not been rigorously analyzed and contrasted to
                 each other, and thus their relative suitability for
                 different streaming environments is not clear. This
                 article presents comprehensive analysis and comparison
                 among different schemes proposed in the literature to
                 authenticate multimedia streams. Authentication schemes
                 for nonscalable and scalable multimedia streams are
                 analyzed. To conduct this analysis, we define five
                 important performance metrics, which are computation
                 cost, communication overhead, receiver buffer size,
                 delay, and tolerance to packet losses. We derive
                 analytic formulas for these metrics for all considered
                 authentication schemes to numerically analyze their
                 performance. In addition, we implement all schemes in a
                 simulator to study and compare their performance in
                 different environments. The parameters for the
                 simulator are carefully chosen to mimic realistic
                 settings. We draw several conclusions on the advantages
                 and disadvantages of each scheme. We extend our
                 analysis to authentication techniques for scalable
                 streams. We pay careful attention to the flexibility of
                 scalable streams and analyze its impacts on the
                 authentication schemes. Our analysis and comparison
                 reveal the merits and shortcomings of each scheme,
                 provide guidelines on choosing the most appropriate
                 scheme for a given multimedia streaming application,
                 and could stimulate designing new authentication
                 schemes or improving existing ones. For example, our
                 detailed analysis has led us to design a new
                 authentication scheme that combines the best features
                 of two previous schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "authentication schemes; Multimedia authentication;
                 multimedia security; multimedia streaming; scalable
                 coding; secure streaming",
}

@Article{Yang:2010:EMP,
  author =       "Zhenyu Yang and Wanmin Wu and Klara Nahrstedt and
                 Gregorij Kurillo and Ruzena Bajcsy",
  title =        "Enabling multi-party {$3$D} tele-immersive
                 environments with {{\em ViewCast}}",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671962.1671963",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Aug 14 17:17:15 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Three-dimensional tele-immersive (3DTI) environments
                 have great potential to promote collaborative work
                 among geographically distributed users. However, most
                 existing 3DTI systems only work with two sites due to
                 the huge demand of resources and the lack of a simple
                 yet powerful networking model to handle connectivity,
                 scalability, and quality-of-service (QoS)
                 guarantees.\par

                 In this article, we explore the design space from the
                 angle of multi-stream management to enable multi-party
                 3DTI communication. Multiple correlated 3D video
                 streams are employed to provide a comprehensive
                 representation of the physical scene in each 3DTI
                 environment, and are rendered together to establish a
                 common cyberspace among all participating 3DTI
                 environments. The existence of multi-stream correlation
                 provides the unique opportunity for new approaches in
                 QoS provisioning. Previous work mostly concentrated on
                 compression and adaptation techniques on the per-stream
                 basis while ignoring the application layer semantics
                 and the coordination required among streams. We propose
                 an innovative and generalized {\em ViewCast\/} model to
                 coordinate the multi-stream content dissemination over
                 an overlay network. ViewCast leverages view semantics
                 in 3D free-viewpoint video systems to fill the gap
                 between high-level user interest and low-level stream
                 management. In ViewCast, only the view information is
                 specified by the user/application, while the underlying
                 control dynamically performs stream differentiation,
                 selection, coordination, and dissemination. We present
                 the details of ViewCast and evaluate it through both
                 simulation and 3DTI sessions among tele-immersive
                 environments residing in different institutes across
                 the Internet2. Our experimental results demonstrate the
                 implementation feasibility and performance enhancement
                 of ViewCast in supporting multi-party 3DTI
                 collaboration.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "3D tele-immersion; application level multicast;
                 distributed multimedia system; multi-stream
                 coordination; networking protocol; QoS adaptation",
}

@Article{Wu:2010:ELT,
  author =       "Junwen Wu and Mohan M. Trivedi",
  title =        "An eye localization, tracking and blink pattern
                 recognition system: {Algorithm} and evaluation",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671962.1671964",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Aug 14 17:17:15 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This study is to investigate the fundamental problems
                 of, (1) facial feature detection and localization,
                 especially eye features; and (2) eye dynamics,
                 including tracking and blink detection. We first
                 describe our contribution to eye localization.
                 Following that, we discuss a simultaneous eye tracking
                 and blink detection system. Facial feature detection is
                 solved in a general object detection framework and its
                 performance for eye localization is presented. A binary
                 tree representation based on feature dependency
                 partitions the object feature space in a coarse to fine
                 manner. In each compact feature subspace, independent
                 component analysis (ICA) is used to get the independent
                 sources, whose probability density functions (PDFs) are
                 modeled by Gaussian mixtures. When applying this
                 representation for the task of eye detection, a
                 subwindow is used to scan the entire image and each
                 obtained image patch is examined using Bayesian
                 criteria to determine the presence of an eye subject.
                 After the eyes are automatically located with binary
                 tree-based probability learning, interactive particle
                 filters are used for simultaneously tracking the eyes
                 and detecting the blinks. The particle filters use
                 classification-based observation models, in which the
                 posterior probabilities are evaluated by logistic
                 regressions in tensor subspaces. Extensive experiments
                 are used to evaluate the performance from two aspects,
                 (1) blink detection rate and the accuracy of blink
                 duration in terms of the frame numbers; (2) eye
                 tracking accuracy. We also present an experimental
                 setup for obtaining the benchmark data in tracking
                 accuracy evaluation. The experimental evaluation
                 demonstrates the capability of this approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Eye blink detection; human computer interface;
                 particle filtering; video processing",
}

@Article{Jin:2010:DMN,
  author =       "Xing Jin and S.-H. Gary Chan",
  title =        "Detecting malicious nodes in peer-to-peer streaming by
                 peer-based monitoring",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671962.1671965",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Aug 14 17:17:15 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Current peer-to-peer (P2P) streaming systems often
                 assume that nodes cooperate to upload and download
                 data. However, in the open environment of the Internet,
                 this is not necessarily true and there exist malicious
                 nodes in the system. In this article, we study
                 malicious actions of nodes that can be detected through
                 peer-based monitoring. We require each node to monitor
                 the data received and to periodically send monitoring
                 messages about its neighbors to some trustworthy nodes.
                 To efficiently store and search messages among multiple
                 trustworthy nodes, we organize trustworthy nodes into a
                 threaded binary tree. Trustworthy nodes also
                 dynamically redistribute monitoring messages among
                 themselves to achieve load balancing. Our simulation
                 results show that this scheme can efficiently detect
                 malicious nodes with high accuracy, and that the
                 dynamic redistribution method can achieve good load
                 balancing among trustworthy nodes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Malicious nodes; peer monitoring; peer-to-peer
                 streaming",
}

@Article{Chiu:2010:FMH,
  author =       "Chih-Yi Chiu and Hsin-Min Wang and Chu-Song Chen",
  title =        "Fast min-hashing indexing and robust spatio-temporal
                 matching for detecting video copies",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "2",
  pages =        "10:1--10:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671962.1671966",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Aug 14 17:17:15 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The increase in the number of video copies, both legal
                 and illegal, has become a major problem in the
                 multimedia and Internet era. In this article, we
                 propose a novel method for detecting various video
                 copies in a video sequence. To achieve fast and robust
                 detection, the method fully integrates several
                 components, namely the min-hashing signature to
                 compactly represent a video sequence, a spatio-temporal
                 matching scheme to accurately evaluate video similarity
                 compiled from the spatial and temporal aspects, and
                 some speedup techniques to expedite both min-hashing
                 indexing and spatio-temporal matching. The results of
                 experiments demonstrate that, compared to several
                 baseline methods with different feature descriptors and
                 matching schemes, the proposed method which combines
                 both global and local feature descriptors yields the
                 best performance when encountering a variety of video
                 transformations. The method is very fast, requiring
                 approximately 0.06 seconds to search for copies of a
                 thirty-second video clip in a six-hour video
                 sequence.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Content-based copy detection; histogram pruning;
                 near-duplicate",
}

@Article{Sarhan:2010:WTP,
  author =       "Nabil J. Sarhan and Mohammad A. Alsmirat and Musab
                 Al-Hadrusi",
  title =        "Waiting-time prediction in scalable on-demand video
                 streaming",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "2",
  pages =        "11:1--11:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1671962.1671967",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Sat Aug 14 17:17:15 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Providing video streaming users with expected waiting
                 times enhances their perceived quality-of-service (QoS)
                 and encourages them to wait. In the absence of any
                 waiting-time feedback, users are more likely to defect
                 because of the uncertainty as to when their services
                 will start. We analyze waiting-time predictability in
                 scalable video streaming. We propose two prediction
                 schemes and study their effectiveness when applied with
                 various stream merging techniques and scheduling
                 policies. The results demonstrate that the waiting time
                 can be predicted accurately, especially when enhanced
                 cost-based scheduling is applied. The combination of
                 waiting-time prediction and cost-based scheduling leads
                 to outstanding performance benefits.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  keywords =     "Scheduling; stream merging; time-of-service
                 guarantees; video streaming; waiting-time prediction",
}

@Article{Xu:2010:IBP,
  author =       "Changsheng Xu and Eckehard Steinbach and Abdulmotaleb
                 {El Saddik} and Michelle Zhou",
  title =        "Introduction to the best papers of {ACM Multimedia
                 2009}",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1830482",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zha:2010:VQS,
  author =       "Zheng-Jun Zha and Linjun Yang and Tao Mei and Meng
                 Wang and Zengfu Wang and Tat-Seng Chua and Xian-Sheng
                 Hua",
  title =        "Visual query suggestion: {Towards} capturing user
                 intent in {Internet} image search",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823747",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Jiang:2010:AVA,
  author =       "Wei Jiang and Courtenay Cotton and Shih-Fu Chang and
                 Dan Ellis and Alexander C. Loui",
  title =        "Audio-visual atoms for generic video concept
                 classification",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823748",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{DeOliveira:2010:LND,
  author =       "Rodrigo {De Oliveira} and Mauro Cherubini and Nuria
                 Oliver",
  title =        "Looking at near-duplicate videos from a human-centric
                 perspective",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823749",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yin:2010:LEC,
  author =       "Hao Yin and Xuening Liu and Tongyu Zhan and Vyas Sekar
                 and Feng Qiu and Chuang Lin and Hui Zhang and Bo Li",
  title =        "{LiveSky}: {Enhancing} {CDN} with {P2P}",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823750",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Money:2010:EEL,
  author =       "Arthur G. Money and Harry Agius",
  title =        "{ELVIS}: {Entertainment-Led VIdeo Summaries}",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823751",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hoi:2010:SSD,
  author =       "Steven C. H. Hoi and Wei Liu and Shih-Fu Chang",
  title =        "Semi-supervised distance metric learning for
                 collaborative image retrieval and clustering",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823752",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Maddage:2010:WLA,
  author =       "Namunu C. Maddage and Khe Chai Sim and Haizhou Li",
  title =        "Word level automatic alignment of music and lyrics
                 using vocal synthesis",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "19:1--19:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823753",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Qudah:2010:EDD,
  author =       "Bashar Qudah and Nabil J. Sarhan",
  title =        "Efficient delivery of on-demand video streams to
                 heterogeneous receivers",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "20:1--20:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823754",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gomes:2010:STA,
  author =       "Jo{\~a}o V. P. Gomes and Pedro R. M. In{\'a}cio and
                 Branka Lakic and M{\'a}rio M. Freire and Henrique J. A.
                 {Da Silva} and Paulo P. Monteiro",
  title =        "Source traffic analysis",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "21:1--21:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1823755",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Boll:2010:CPA,
  author =       "Susanne Boll and Jiebo Luo and Ramesh Jain and Dong
                 Xu",
  title =        "Call for papers: {ACM Transactions on Multimedia
                 Computing, Communications and Applications} special
                 issue on social media",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "3",
  pages =        "22:1--22:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1823746.1837254",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2010:OOD,
  author =       "Ralf Steinmetz",
  title =        "Obituary to our dear friend {Professor Dr. Nicolas D.
                 Georganas, PhD}",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "23:1--23:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865107",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Haenselmann:2010:FSI,
  author =       "Thomas Haenselmann",
  title =        "Foreword to the special issue on multimedia sensor
                 fusion",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "24:1--24:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865108",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2010:MBA,
  author =       "Xiangyu Wang and Mohan Kankanhalli",
  title =        "{MultiFusion}: a boosting approach for multimedia
                 fusion",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "25:1--25:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865109",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chetty:2010:MSF,
  author =       "Girija Chetty and Matthew White",
  title =        "Multimedia sensor fusion for retrieving identity in
                 biometric access control systems",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "26:1--26:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865110",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Friedland:2010:DAS,
  author =       "Gerald Friedland and Chuohao Yeo and Hayley Hung",
  title =        "Dialocalization: {Acoustic} speaker diarization and
                 visual localization as joint optimization problem",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "27:1--27:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865111",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Rahman:2010:SGA,
  author =       "Abu Saleh Md Mahfujur Rahman and M. Anwar Hossain and
                 Abdulmotaleb {El Saddik}",
  title =        "Spatial-geometric approach to physical mobile
                 interaction based on accelerometer and {IR} sensory
                 data fusion",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "28:1--28:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865112",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2010:EMT,
  author =       "Zhenyu Yang and Wanmin Wu and Klara Nahrstedt and
                 Gregorij Kurillo and Ruzena Bajcsy",
  title =        "Enabling multiparty {$3$D} tele-immersive environments
                 with {ViewCast}",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "29:1--29:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865113",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Marshall:2010:OCM,
  author =       "Damien Marshall and S{\'e}amus McLoone and Tom{\'a}s
                 Ward",
  title =        "Optimizing consistency by maximizing bandwidth usage
                 in distributed interactive applications",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "30:1--30:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865114",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Vu:2010:UOC,
  author =       "Long Vu and Indranil Gupta and Klara Nahrstedt and Jin
                 Liang",
  title =        "Understanding overlay characteristics of a large-scale
                 peer-to-peer {IPTV} system",
  journal =      j-TOMCCAP,
  volume =       "6",
  number =       "4",
  pages =        "31:1--31:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1865106.1865115",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Tue Nov 23 10:03:16 MST 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Meyer:2011:MRL,
  author =       "Marek Meyer and Christoph Rensing and Ralf Steinmetz",
  title =        "Multigranularity reuse of learning resources",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870121.1870122",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:41 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bouyakoub:2011:SBI,
  author =       "Samia Bouyakoub and Abdelkader Belkhir",
  title =        "{SMIL} builder: an incremental authoring tool for
                 {SMIL Documents}",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870121.1870123",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:41 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hossain:2011:MAQ,
  author =       "M. Anwar Hossain and Pradeep K. Atrey and Abdulmotaleb
                 {El Saddik}",
  title =        "Modeling and assessing quality of information in
                 multisensor multimedia monitoring systems",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870121.1870124",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:41 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhu:2011:NDK,
  author =       "Jianke Zhu and Steven C. H. Hoi and Michael R. Lyu and
                 Shuicheng Yan",
  title =        "Near-duplicate keyframe retrieval by semi-supervised
                 learning and nonrigid image matching",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870121.1870125",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:41 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hsu:2011:FCL,
  author =       "Cheng-Hsin Hsu and Mohamed Hefeeda",
  title =        "A framework for cross-layer optimization of video
                 streaming in wireless networks",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870121.1870126",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:41 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chandra:2011:EAS,
  author =       "Surendar Chandra and Xuwen Yu",
  title =        "An empirical analysis of serendipitous media sharing
                 among campus-wide wireless users",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870121.1870127",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:41 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gopinathan:2011:OLM,
  author =       "Ajay Gopinathan and Zongpeng Li",
  title =        "Optimal layered multicast",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1925101.1925102",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:42 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hsu:2011:USS,
  author =       "Cheng-Hsin Hsu and Mohamed Hefeeda",
  title =        "Using simulcast and scalable video coding to
                 efficiently control channel switching delay in mobile
                 {TV} broadcast networks",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1925101.1925103",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:42 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Jin:2011:KDH,
  author =       "Yohan Jin and Balakrishnan Prabhakaran",
  title =        "Knowledge discovery from {$3$D} human motion streams
                 through semantic dimensional reduction",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1925101.1925104",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:42 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Cheng:2011:MPM,
  author =       "Wei Cheng and Wei Tsang Ooi and Sebastien Mondet and
                 Romulus Grigoras and G{\'e}raldine Morin",
  title =        "Modeling progressive mesh streaming: {Does} data
                 dependency matter?",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1925101.1925105",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  bibdate =      "Wed Mar 16 09:25:42 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bagchi:2011:FAD,
  author =       "Susmit Bagchi",
  title =        "A fuzzy algorithm for dynamically adaptive multimedia
                 streaming",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1925101.1925106",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Mar 16 09:25:42 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hsu:2011:SMV,
  author =       "Cheng-Hsin Hsu and Mohamed Hefeeda",
  title =        "Statistical multiplexing of variable-bit-rate videos
                 streamed to mobile devices",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1925101.1925107",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Mar 16 09:25:42 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2011:EN,
  author =       "Ralf Steinmetz",
  title =        "Editorial notice",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000487",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Korshunov:2011:VQF,
  author =       "Pavel Korshunov and Wei Tsang Ooi",
  title =        "Video quality for face detection, recognition, and
                 tracking",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000488",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lin:2011:PCI,
  author =       "Pei-Yu Lin and Jung-San Lee and Chin-Chen Chang",
  title =        "Protecting the content integrity of digital imagery
                 with fidelity preservation",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000489",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{VanLeuken:2011:SVO,
  author =       "Reinier H. {Van Leuken} and Remco C. Veltkamp",
  title =        "Selecting vantage objects for similarity indexing",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000490",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Feng:2011:SRI,
  author =       "Wu-Chi Feng and Thanh Dang and John Kassebaum and Tim
                 Bauman",
  title =        "Supporting region-of-interest cropping through
                 constrained compression",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000491",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2011:DBA,
  author =       "Qingzhong Liu and Andrew H. Sung and Mengyu Qiao",
  title =        "Derivative-based audio steganalysis",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000492",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2011:GDO,
  author =       "Frederick W. B. Li and Rynson W. H. Lau and Danny
                 Kilis and Lewis W. F. Li",
  title =        "Game-on-demand: {An} online game engine based on
                 geometry streaming",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "3",
  pages =        "19:1--19:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000486.2000493",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 5 17:00:22 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shirmohammadi:2011:IAM,
  author          = {Shervin Shirmohammadi and Jiebo Luo and Jie Yang and Abdulmotaleb El Saddik},
  title           = {Introduction to {ACM Multimedia 2010} best paper candidates},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {20:1--20:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037677},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Bhattacharya:2011:HAA,
  author          = {Subhabrata Bhattacharya and Rahul Sukthankar and Mubarak Shah},
  title           = {A holistic approach to aesthetic enhancement of photographs},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {21:1--21:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037678},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Tan:2011:URS,
  author          = {Shulong Tan and Jiajun Bu and Chun Chen and Bin Xu and Can Wang and Xiaofei He},
  title           = {Using rich social media information for music recommendation via hypergraph model},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {22:1--22:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037679},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Milani:2011:CAE,
  author          = {Simone Milani and Giancarlo Calvagno},
  title           = {A cognitive approach for effective coding and transmission of {$3$D} video},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {23:1--23:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037680},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {23},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Hong:2011:VAE,
  author          = {Richang Hong and Meng Wang and Xiao-Tong Yuan and Mengdi Xu and Jianguo Jiang and Shuicheng Yan and Tat-Seng Chua},
  title           = {Video accessibility enhancement for hearing-impaired users},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {24:1--24:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037681},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {24},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Boll:2011:ISI,
  author          = {Susanne Boll and Ramesh Jain and Jiebo Luo and Dong Xu},
  title           = {Introduction to special issue on social media},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {25:1--25:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037682},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Lin:2011:EOM,
  author          = {Yu-Ching Lin and Yi-Hsuan Yang and Homer H. Chen},
  title           = {Exploiting online music tags for music emotion classification},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {26:1--26:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037683},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {26},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Rabbath:2011:ACP,
  author          = {Mohamad Rabbath and Philipp Sandhaus and Susanne Boll},
  title           = {Automatic creation of photo books from stories in social media},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {27:1--27:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037684},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {27},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Hu:2011:RAI,
  author          = {Weiming Hu and Haiqiang Zuo and Ou Wu and Yunfei Chen and Zhongfei Zhang and David Suter},
  title           = {Recognition of adult images, videos, and web page bags},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {28:1--28:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037685},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {28},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Lin:2011:SSC,
  author =       "Yu-Ru Lin and K. Sel{\c{c}}uk Candan and Hari
                 Sundaram and Lexing Xie",
  title =        "{SCENT}: {Scalable} compressed monitoring of evolving
                 multirelational social networks",
  journal =      j-TOMCCAP,
  volume =       "7S",
  number =       "1",
  pages =        "29:1--29:??",
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2037676.2037686",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun Nov 6 06:36:59 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Sang:2011:BCT,
  author          = {Jitao Sang and Changsheng Xu},
  title           = {Browse by chunks: {Topic} mining and organizing on web-scale social media},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {30:1--30:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037687},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {30},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ji:2011:MFL,
  author          = {Rongrong Ji and Yue Gao and Bineng Zhong and Hongxun Yao and Qi Tian},
  title           = {Mining {\tt flickr} landmarks by modeling reconstruction sparsity},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {31:1--31:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037688},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {31},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Mandel:2011:CTI,
  author          = {Michael I. Mandel and Razvan Pascanu and Douglas Eck and Yoshua Bengio and Luca M. Aiello and Rossano Schifanella and Filippo Menczer},
  title           = {Contextual tag inference},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {32:1--32:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037689},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {32},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Biel:2011:VCB,
  author          = {Joan-Isaac Biel and Daniel Gatica-Perez},
  title           = {{VlogSense}: {Conversational} behavior and social attention in {YouTube}},
  journal         = j-TOMCCAP,
  volume          = {7S},
  number          = {1},
  pages           = {33:1--33:??},
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2037676.2037690},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Sun Nov 6 06:36:59 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {33},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Anonymous:2011:TCO,
  author          = {Anonymous},
  title           = {Table of Contents: Online Supplement Volume {7S}, Number 1},
  journal         = j-TOMCCAP,
  volume          = {7},
  number          = {4},
  pages           = {34:1--34:??},
  month           = nov,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2043612.2043620},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Dec 15 08:53:32 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {34},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Hong:2011:BSE,
  author          = {Richang Hong and Jinhui Tang and Hung-Khoon Tan and Chong-Wah Ngo and Shuicheng Yan and Tat-Seng Chua},
  title           = {Beyond search: Event-driven summarization for {Web} videos},
  journal         = j-TOMCCAP,
  volume          = {7},
  number          = {4},
  pages           = {35:1--35:??},
  month           = nov,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2043612.2043613},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Dec 15 08:53:32 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {35},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Kuo:2011:TPQ,
  author          = {Wen-Kuang Kuo and Kuo-Wei Wu},
  title           = {Traffic prediction and {QoS} transmission of real-time live {VBR} videos in {WLANs}},
  journal         = j-TOMCCAP,
  volume          = {7},
  number          = {4},
  pages           = {36:1--36:??},
  month           = nov,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2043612.2043614},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Dec 15 08:53:32 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Maddage:2011:BSS,
  author          = {Namunu C. Maddage and Haizhou Li},
  title           = {Beat space segmentation and octave scale cepstral feature for sung language recognition in pop music},
  journal         = j-TOMCCAP,
  volume          = {7},
  number          = {4},
  pages           = {37:1--37:??},
  month           = nov,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2043612.2043615},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Dec 15 08:53:32 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Santini:2011:ECQ,
  author          = {Simone Santini},
  title           = {Efficient computation of queries on feature streams},
  journal         = j-TOMCCAP,
  volume          = {7},
  number          = {4},
  pages           = {38:1--38:??},
  month           = nov,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2043612.2043616},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Dec 15 08:53:32 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Verdugo:2011:IFC,
  author =       "Renato Verdugo and Miguel Nussbaum and Pablo Corro and
                 Pablo Nu{\~n}ez and Paula Navarrete",
  title =        "Interactive films and coconstruction",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "4",
  pages =        "39:1--39:??",
  month =        nov,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2043612.2043617",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 15 08:53:32 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ghandeharizadeh:2011:DCC,
  author          = {Shahram Ghandeharizadeh and Shahin Shayandeh},
  title           = {Domical cooperative caching for streaming media in wireless home networks},
  journal         = j-TOMCCAP,
  volume          = {7},
  number          = {4},
  pages           = {40:1--40:??},
  month           = nov,
  year            = {2011},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2043612.2043618},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Dec 15 08:53:32 MST 2011},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

%%% NOTE(review): the author list of the following entry is identical
%%% to that of the preceding entry (Ghandeharizadeh:2011:DCC).  For a
%%% call-for-papers notice this may be a metadata carry-over --- TODO
%%% confirm against the publisher's record.  The citation key is left
%%% unchanged so that existing citations keep resolving.
@Article{Ghandeharizadeh:2011:CPS,
  author =       "Shahram Ghandeharizadeh and Shahin Shayandeh",
  title =        "Call for papers: Special issue on {$3$D} mobile
                 multimedia",
  journal =      j-TOMCCAP,
  volume =       "7",
  number =       "4",
  pages =        "41:1--41:??",
  month =        nov,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2043612.2043619",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 15 08:53:32 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2012:ENC,
  author          = {Ralf Steinmetz},
  title           = {Editorial note and call for nominations: {Nicolas D. Georganas} best paper award},
  journal         = j-TOMCCAP,
  volume          = {8},
  number          = {1},
  pages           = {1:1--1:??},
  month           = jan,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2071396.2071397},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Mar 16 15:56:02 MDT 2012},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ghinea:2012:SSS,
  author =       "Gheorghita Ghinea and Oluwakemi Ademoye",
  title =        "The sweet smell of success: Enhancing multimedia
                 applications with olfaction",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071398",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Olfaction, or smell, is one of the last challenges
                 which multimedia applications have to conquer. As far
                 as computerized smell is concerned, there are several
                 difficulties to overcome, particularly those associated
                 with the ambient nature of smell. In this article, we
                 present results from an empirical study exploring
                 users' perception of olfaction-enhanced multimedia
                 displays. Findings show that olfaction significantly
                 adds to the user multimedia experience. Moreover, use
                 of olfaction leads to an increased sense of reality and
                 relevance. Our results also show that users are
                 tolerant of the interference and distortion effects
                 caused by olfactory effect in multimedia.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hefeeda:2012:DET,
  author =       "Mohamed Hefeeda and Cheng-Hsin Hsu",
  title =        "Design and evaluation of a testbed for mobile {TV}
                 networks",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071399",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents the design of a complete,
                 open-source, testbed for broadcast networks that offer
                 mobile TV services. Although basic architectures and
                 protocols have been developed for such networks,
                 detailed performance tuning and analysis are still
                 needed, especially when these networks scale to serve
                 many diverse TV channels to numerous subscribers. The
                 detailed performance analysis could also motivate
                 designing new protocols and algorithms for enhancing
                 future mobile TV networks. Currently, many researchers
                 evaluate the performance of mobile TV networks using
                 simulation and/or theoretical modeling methods. These
                 methods, while useful for early assessment, typically
                 abstract away many necessary details of actual, fairly
                 complex, networks. Therefore, an open-source platform
                 for evaluating new ideas in a real mobile TV network is
                 needed. This platform is currently not possible with
                 commercial products, because they are sold as black
                 boxes without the source code. In this article, we
                 summarize our experiences in designing and implementing
                 a testbed for mobile TV networks. We integrate
                 off-the-shelf hardware components with carefully
                 designed software modules to realize a scalable testbed
                 that covers almost all aspects of real networks. We use
                 our testbed to empirically analyze various performance
                 aspects of mobile TV networks and validate/refute
                 several claims made in the literature as well as
                 discover/quantify multiple important performance
                 tradeoffs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lin:2012:DMS,
  author =       "Yu-Ru Lin and Hari Sundaram and Munmun {De Choudhury}
                 and Aisling Kelliher",
  title =        "Discovering multirelational structure in social media
                 streams",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071400",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we present a novel algorithm to
                 discover multirelational structures from social media
                 streams. A media item such as a photograph exists as
                 part of a meaningful interrelationship among several
                 attributes, including time, visual content, users, and
                 actions. Discovery of such relational structures
                 enables us to understand the semantics of human
                 activity and has applications in content organization,
                 recommendation algorithms, and exploratory social
                 network analysis. We are proposing a novel nonnegative
                 matrix factorization framework to characterize
                 relational structures of group photo streams. The
                 factorization incorporates image content features and
                 contextual information. The idea is to consider a
                 cluster as having similar relational patterns; each
                 cluster consists of photos relating to similar content
                 or context. Relations represent different aspects of
                 the photo stream data, including visual content,
                 associated tags, photo owners, and post times. The
                 extracted structures minimize the mutual information of
                 the predicted joint distribution. We also introduce a
                 relational modularity function to determine the
                 structure cost penalty, and hence determine the number
                 of clusters. Extensive experiments on a large Flickr
                 dataset suggest that our approach is able to extract
                 meaningful relational patterns from group photo
                 streams. We evaluate the utility of the discovered
                 structures through a tag prediction task and through a
                 user study. Our results show that our method based on
                 relational structures, outperforms baseline methods,
                 including feature and tag frequency based techniques,
                 by 35\%--420\%. We have conducted a qualitative user
                 study to evaluate the benefits of our framework in
                 exploring group photo streams. The study indicates that
                 users found the extracted clustering results clearly
                 represent major themes in a group; the clustering
                 results not only reflect how users describe the group
                 data but often lead the users to discover the evolution
                 of the group activity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Cheng:2012:EIC,
  author =       "Xu Cheng and Jiangchuan Liu",
  title =        "Exploring interest correlation for peer-to-peer
                 socialized video sharing",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071401",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The last five years have witnessed an explosion of
                 networked video sharing, represented by YouTube, as a
                 new killer Internet application. Their sustainable
                 development however is severely hindered by the
                 intrinsic limit of their client/server architecture. A
                 shift to the peer-to-peer paradigm has been widely
                 suggested with success already shown in live video
                 streaming and movie-on-demand. Unfortunately, our
                 latest measurement demonstrates that short video clips
                 exhibit drastically different statistics, which would
                 simply render these existing solutions suboptimal, if
                 not entirely inapplicable. Our long-term measurement
                 over five million YouTube videos, on the other hand,
                 reveals interesting social networks with strong
                 correlation among the videos, thus opening new
                 opportunities to explore. In this article, we present
                 NetTube, a novel peer-to-peer assisted delivering
                 framework that explores the user interest correlation
                 for short video sharing. We address a series of key
                 design issues to realize the system, including a
                 bi-layer overlay, an efficient indexing scheme, a
                 delay-aware scheduling mechanism, and a prefetching
                 strategy leveraging interest correlation. We evaluate
                 NetTube through both simulations and prototype
                 experiments, which show that it greatly reduces the
                 server workload, improves the playback quality and
                 scales well.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Mei:2012:ITC,
  author =       "Tao Mei and Lusong Li and Xian-Sheng Hua and Shipeng
                 Li",
  title =        "{ImageSense}: Towards contextual image advertising",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071402",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The daunting volumes of community-contributed media
                 contents on the Internet have become one of the primary
                 sources for online advertising. However, conventional
                 advertising treats image and video advertising as
                 general text advertising by displaying relevant ads
                 based on the contents of the Web page, without
                 considering the inherent characteristics of visual
                 contents. This article presents a contextual
                 advertising system driven by images, which
                 automatically associates relevant ads with an image
                 rather than the entire text in a Web page and
                 seamlessly inserts the ads in the nonintrusive areas
                 within each individual image. The proposed system,
                 called ImageSense, supports scalable advertising of,
                 from root to node, Web sites, pages, and images. In
                 ImageSense, the ads are selected based on not only
                 textual relevance but also visual similarity, so that
                 the ads yield contextual relevance to both the text in
                 the Web page and the image content. The ad insertion
                 positions are detected based on image salience, as well
                 as face and text detection, to minimize intrusiveness
                 to the user. We evaluate ImageSense on a large-scale
                 real-world images and Web pages, and demonstrate the
                 effectiveness of ImageSense for online image
                 advertising.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Snidaro:2012:FMV,
  author =       "Lauro Snidaro and Ingrid Visentini and Gian Luca
                 Foresti",
  title =        "Fusing multiple video sensors for surveillance",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071403",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Real-time detection, tracking, recognition, and
                 activity understanding of moving objects from multiple
                 sensors represent fundamental issues to be solved in
                 order to develop surveillance systems that are able to
                 autonomously monitor wide and complex environments. The
                 algorithms that are needed span therefore from image
                 processing to event detection and behaviour
                 understanding, and each of them requires dedicated
                 study and research. In this context, sensor fusion
                 plays a pivotal role in managing the information and
                 improving system performance. Here we present a novel
                 fusion framework for combining the data coming from
                 multiple and possibly heterogeneous sensors observing a
                 surveillance area.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Huang:2012:TAM,
  author =       "Jiun-Long Huang and Shih-Chuan Chiu and Man-Kwan
                 Shan",
  title =        "Towards an automatic music arrangement framework using
                 score reduction",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2071396.2071404",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:02 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Score reduction is a process that arranges music for a
                 target instrument by reducing original music. In this
                 study we present a music arrangement framework that
                 uses score reduction to automatically arrange music for
                 a target instrument. The original music is first
                 analyzed to determine the type of arrangement element
                 of each section, then the phrases are identified and
                 each is assigned a utility according to its type of
                 arrangement element. For a set of utility-assigned
                 phrases, we transform the music arrangement into an
                 optimization problem and propose a phrase selection
                 algorithm. The music is arranged by selecting
                 appropriate phrases satisfying the playability
                 constraints of a target instrument. Using the proposed
                 framework, we implement a music arrangement system for
                 the piano. An approach similar to Turing test is used
                 to evaluate the quality of the music arranged by our
                 system. The experiment results show that our system is
                 able to create viable music for the piano.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2012:EN,
  author =       "Ralf Steinmetz",
  title =        "Editorial note",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089086",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2012:BET,
  author =       "Dongyu Liu and Fei Li and Bo Shen and Songqing Chen",
  title =        "Building an efficient transcoding overlay for {P2P}
                 streaming to heterogeneous devices",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089087",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the increasing deployment of Internet P2P/overlay
                 streaming systems, more and more clients use mobile
                 devices, such as smart phones and PDAs, to access these
                 Internet streaming services. Compared to wired
                 desktops, mobile devices normally have a smaller screen
                 size, a less color depth, and lower bandwidth and thus
                 cannot correctly and effectively render and display the
                 data streamed to desktops. To address this problem, in
                 this paper, we propose PAT (Peer-Assisted Transcoding)
                 to enable effective online transcoding in P2P/overlay
                 streaming. PAT has the following unique features.
                 First, it leverages active peer cooperation without
                 demanding infrastructure support such as transcoding
                 servers. Second, as online transcoding is
                 computationally intensive while the various devices
                 used by participating clients may have limited
                 computing power and related resources (e.g., battery,
                 bandwidth), an additional overlay, called metadata
                 overlay, is constructed to instantly share the
                 intermediate transcoding result of a transcoding
                 procedure with other transcoding nodes to minimize the
                 total computing overhead in the system. The
                 experimental results collected within a realistically
                 simulated testbed show that by consuming 6\% extra
                 bandwidth, PAT could save up to 58\% CPU cycles for
                 online transcoding.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shen:2012:IFP,
  author =       "Zhijie Shen and Roger Zimmermann",
  title =        "{ISP}-friendly {P2P} live streaming: a roadmap to
                 realization",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089088",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Peer-to-Peer (P2P) applications generate large amounts
                 of Internet network traffic. The wide-reaching
                 connectivity of P2P systems is creating resource
                 inefficiencies for network providers. Recent studies
                 have demonstrated that localizing cross-ISP (Internet
                 service provider) traffic can mitigate this challenge.
                 However, bandwidth sensitivity and display quality
                 requirements complicate the ISP-friendly design for
                 live streaming systems. To this date, although some
                 prior techniques focusing on live streaming systems
                 exist, the correlation between traffic localization and
                 streaming quality guarantee has not been well explored.
                 Additionally, the proposed solutions are often not easy
                 to apply in practice. In our presented work, we
                 demonstrate that the cross-ISP traffic of P2P live
                 streaming systems can be significantly reduced with
                 little impact on the streaming quality. First, we
                 analytically investigate and quantify the tradeoff
                 between traffic localization and streaming quality
                 guarantee, determining the lower bound of the inter-AS
                 (autonomous system) streaming rate below which
                 streaming quality cannot be preserved. Based on the
                 analysis, we further propose a practical ISP-friendly
                 solution, termed IFPS, which requires only minor
                 changes to the peer selection mechanism and can easily
                 be integrated into both new and existing systems.
                 Additionally, the significant opportunity for
                 localizing traffic is underscored by our collected
                 traces from PPLive, which also enabled us to derive
                 realistic parameters to guide our simulations. The
                 experimental results demonstrate that IFPS reduces
                 cross-ISP traffic from 81\% up to 98\% while keeping
                 streaming quality virtually unaffected.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lou:2012:QDD,
  author =       "Xiaosong Lou and Kai Hwang",
  title =        "Quality of data delivery in peer-to-peer video
                 streaming",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089089",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "QoS in a P2P video streaming system is evaluated in
                 three stages: content generation, data delivery and
                 video playback. We use jitter-free probability as the
                 main performance metric to study Quality of Data
                 delivery (QoD). A new model that incorporates both
                 bandwidth and data availability of P2P network is
                 proposed. Our model relies on a sharing factor that
                 models data availability among all peers. We simulate
                 on a minimalistic network to demonstrate how to apply
                 the analytical model to design a P2P video streaming
                 system with a very low jitter rate. Our simulation
                 experimental results reveal that the lower bound on
                 jitter-free probability is indeed effective to reflect
                 the QoD of the entire system. Our model captures the
                 impact of many design choices, including upload
                 bandwidth limit, peer selection strategies, and video
                 stream chunking schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2012:DNW,
  author =       "Chuan Wu and Baochun Li and Shuqiao Zhao",
  title =        "Diagnosing network-wide {P2P} live streaming
                 inefficiencies",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089090",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Large-scale live peer-to-peer (P2P) streaming
                 applications have been successfully deployed in today's
                 Internet. While they can accommodate hundreds of
                 thousands of users simultaneously with hundreds of
                 channels of programming, there still commonly exist
                 channels and times where and when the streaming quality
                 is unsatisfactory. In this paper, based on more than
                 two terabytes and one year worth of live traces from
                 UUSee, a large-scale commercial P2P live streaming
                 system, we show an in-depth network-wide diagnosis of
                 streaming inefficiencies, commonly present in typical
                 mesh-based P2P live streaming systems. As the first
                 highlight of our work, we identify an evolutionary
                 pattern of low streaming quality in the system, and the
                 distribution of streaming inefficiencies across various
                 streaming channels and in different geographical
                 regions. We then carry out an extensive investigation
                 to explore the causes to such streaming inefficiencies
                 over different times and across different
                 channels/regions at specific times, by investigating
                 the impact of factors such as the number of peers, peer
                 upload bandwidth, inter-peer bandwidth availability,
                 server bandwidth consumption, and many more. The
                 original discoveries we have brought forward include
                 the two-sided effects of peer population on the
                 streaming quality in a streaming channel, the
                 significant impact of inter-peer bandwidth bottlenecks
                 at peak times, and the inefficient utilization of
                 server capacities across concurrent channels. Based on
                 these insights, we identify problems within the
                 existing P2P live streaming design and discuss a number
                 of suggestions to improve real-world streaming
                 protocols operating at a large scale.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed. Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2012:ABP,
  author =       "Chuan Wu and Zongpeng Li and Xuanjia Qiu and Francis
                 C. M. Lau",
  title =        "Auction-based {P2P VoD} streaming: Incentives and
                 optimal scheduling",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089091",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Real-world large-scale Peer-to-Peer (P2P)
                 Video-on-Demand (VoD) streaming applications face more
                 design challenges as compared to P2P live streaming,
                 due to higher peer dynamics and less buffer overlap.
                 The situation is further complicated when we consider
                 the selfish nature of peers, who in general wish to
                 download more and upload less, unless otherwise
                 motivated. Taking a new perspective of distributed
                 dynamic auctions, we design efficient P2P VoD streaming
                 algorithms with simultaneous consideration of peer
                 incentives and streaming optimality. In our solution,
                 media block exchanges among peers are carried out
                 through local auctions, in which budget-constrained
                 peers bid for desired blocks from their neighbors,
                 which in turn deliver blocks to the winning bidders and
                 collect revenue. With strategic design of a
                 discriminative second price auction with seller
                 reservation, a supplying peer has full incentive to
                 maximally contribute its bandwidth to increase its
                 budget; requesting peers are also motivated to bid in
                 such a way that optimal media block scheduling is
                 achieved effectively in a fully decentralized fashion.
                 Applying techniques from convex optimization and
                 mechanism design, we prove (a) the incentive
                 compatibility at the selling and buying peers, and (b)
                 the optimality of the induced media block scheduling in
                 terms of social welfare maximization. Large-scale
                 empirical studies are conducted to investigate the
                 behavior of the proposed auction mechanisms in dynamic
                 P2P VoD systems based on real-world settings.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2012:PHL,
  author =       "Tieying Zhang and Xueqi Cheng and Jianming Lv and
                 Zhenhua Li and Weisong Shi",
  title =        "Providing hierarchical lookup service for {P2P--VoD}
                 systems",
  journal =      j-TOMCCAP,
  volume =       "8s",
  number =       "1",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2089085.2089092",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Mar 16 15:56:04 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Supporting random jump in P2P-VoD systems requires
                 efficient lookup for the `best' suppliers, where `best'
                 means the suppliers should meet two requirements:
                 content match and network quality match. Most studies
                 use a DHT-based method to provide content lookup;
                 however, these methods are neither able to meet the
                 network quality requirements nor suitable for VoD
                 streaming due to the large overhead. In this paper, we
                 propose Mediacoop, a novel hierarchical lookup scheme
                 combining both content and quality match to provide
                 random jumps for P2P-VoD systems. It exploits the play
                 position to efficiently locate the candidate suppliers
                 with required data (content match), and performs
                 refined lookup within the candidates to meet quality
                 match. Theoretical analysis and simulation results show
                 that Mediacoop is able to achieve lower jump latency
                 and control overhead than the typical DHT-based method.
                 Moreover, we implement Mediacoop in a BitTorrent-like
                 P2P-VoD system called CoolFish and make optimizations
                 for such `total cache' applications. The
                 implementation and evaluation in CoolFish show that
                 Mediacoop is able to improve user experiences,
                 especially the jump latency, which verifies the
                 practicability of our design.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Anonymous:2012:TCO,
  author =       "Anonymous",
  title =        "Table of Contents: Online Supplement Volume {8S},
                 Number 1",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "16:1--16:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2169004",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Dornaika:2012:IRF,
  author =       "Fadi Dornaika and James H. Elder",
  title =        "Image registration for foveated panoramic sensing",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "17:1--17:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2168997",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article addresses the problem of registering
                 high-resolution, small field-of-view images with
                 low-resolution panoramic images provided by a panoramic
                 catadioptric video sensor. Such systems may find
                 application in surveillance and telepresence systems
                 that require a large field of view and high resolution
                 at selected locations. Although image registration has
                 been studied in more conventional applications, the
                 problem of registering panoramic and conventional video
                 has not previously been addressed, and this problem
                 presents unique challenges due to (i) the extreme
                 differences in resolution between the sensors (more
                 than a 16:1 linear resolution ratio in our
                 application), and (ii) the resolution inhomogeneity of
                 panoramic images. The main contributions of this
                 article are as follows. First, we introduce our
                 foveated panoramic sensor design. Second, we show how a
                 coarse registration can be computed from the raw images
                 using parametric template matching techniques. Third,
                 we propose two refinement methods allowing automatic
                 and near real-time registration between the two image
                 streams. The first registration method is based on
                 matching extracted interest points using a closed form
                 method. The second registration method is featureless
                 and based on minimizing the intensity discrepancy
                 allowing the direct recovery of both the geometric and
                 the photometric transforms. Fourth, a comparison
                 between the two registration methods is carried out,
                 which shows that the featureless method is superior in
                 accuracy. Registration examples using the developed
                 methods are presented.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2012:CPC,
  author =       "Xin Zhang and Tom{\'a}s Ward and S{\'e}amus McLoone",
  title =        "Comparison of predictive contract mechanisms from an
                 information theory perspective",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "18:1--18:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2168998",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Inconsistency arises across a Distributed Virtual
                 Environment due to network latency induced by state
                 changes communications. Predictive Contract Mechanisms
                 (PCMs) combat this problem through reducing the amount
                 of messages transmitted in return for perceptually
                 tolerable inconsistency. To date there are no methods
                 to quantify the efficiency of PCMs in communicating
                 this reduced state information. This article presents
                 an approach derived from concepts in information theory
                 for a deeper understanding of PCMs. Through a
                 comparison of representative PCMs, the worked analysis
                 illustrates interesting aspects of PCMs operation and
                 demonstrates how they can be interpreted as a form of
                 lossy information compression.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Olsen:2012:ITN,
  author =       "Dan R. Olsen and Derek Bunn and Trent Boulter and
                 Robert Walz",
  title =        "Interactive television news",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "19:1--19:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2168999",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A new interactive television experience has been
                 created for watching television news. The goal is to
                 create a news experience that is similar to the way
                 people watch television in their living rooms while
                 giving viewers the power to make choices about what
                 they see. We partnered with existing news organizations
                 to create tools consistent with current news production
                 practices. The viewer experience allows selection of
                 the order of news content, skipping unwanted content
                 and exploring stories in more depth. These tools were
                 used to produce seven days of interactive commercial
                 news that were viewed in ten homes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Armitage:2012:ROF,
  author =       "Grenville Armitage and Amiel Heyde",
  title =        "{REED}: {Optimizing} first person shooter game server
                 discovery using network coordinates",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "20:1--20:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2169000",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online First Person Shooter (FPS) games typically use
                 a client-server communication model, with thousands of
                 enthusiast-hosted game servers active at any time.
                 Traditional FPS server discovery may take minutes, as
                 clients create thousands of short-lived packet flows
                 while probing all available servers to find a selection
                 of game servers with tolerable round trip time (RTT).
                 REED reduces a client's probing time and network
                 traffic to 1\% of traditional server discovery. REED
                 game servers participate in a centralized, incremental
                 calculation of their network coordinates, and clients
                 use these coordinates to expedite the discovery of
                 servers with low RTTs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2012:ILC,
  author =       "Xiaobai Liu and Shuicheng Yan and Tat-Seng Chua and
                 Hai Jin",
  title =        "Image label completion by pursuing contextual
                 decomposability",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "21:1--21:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2169001",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article investigates how to automatically
                 complete the missing labels for the partially annotated
                 images, without image segmentation. The label
                 completion procedure is formulated as a nonnegative
                 data factorization problem, to decompose the global
                 image representations that are used for describing the
                 entire images, for instance, various image feature
                 descriptors, into their corresponding label
                 representations, that are used for describing the local
                 semantic regions within images. The solution provided
                 in this work is motivated by following observations.
                 First, label representations of the regions with the
                 same label often share certain commonness, yet may be
                 essentially different due to the large intraclass
                 variations. Thus, each label or concept should be
                 represented by using a subspace spanned by an ensemble
                 of basis, instead of a single one, to characterize the
                 intralabel diversities. Second, the subspaces for
                 different labels are different from each other. Third,
                 while two images are similar with each other, the
                 corresponding label representations should be similar.
                 We formulate this cross-image context as well as the
                 given partial label annotations in the framework of
                 nonnegative data factorization and then propose an
                 efficient multiplicative nonnegative update rules to
                 alternately optimize the subspaces and the
                 reconstruction coefficients. We also provide the
                 theoretic proof of algorithmic convergence and
                 correctness. Extensive experiments over several
                 challenging image datasets clearly demonstrate the
                 effectiveness of our proposed solution in boosting the
                 quality of image label completion and image annotation
                 accuracy. Based on the same formulation, we further
                 develop a label ranking algorithms, to refine the
                 noised image labels without any manual supervision. We
                 compare the proposed label ranking algorithm with the
                 state-of-the-arts over the popular evaluation databases
                 and achieve encouragingly improvements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2012:SGU,
  author =       "Yi Chen and Abhidnya A. Deshpande and Ramazan S.
                 Ayg{\"u}n",
  title =        "Sprite generation using sprite fusion",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "22:1--22:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2169002",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "There has been related research for sprite or mosaic
                 generation for over 15 years. In this article, we try
                 to understand the methodologies for sprite generation
                 and identify what has not actually been covered for
                 sprite generation. We first identify issues and focus
                 on the domain of videos for sprite generation. We
                 introduce a novel sprite fusion method that blends two
                 sprites. Sprite fusion method produces good results for
                 tracking videos and does not require object
                 segmentation. We present sample results of our
                 experiments.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Weng:2012:CVR,
  author =       "Ming-Fang Weng and Yung-Yu Chuang",
  title =        "Collaborative video reindexing via matrix
                 factorization",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2",
  pages =        "23:1--23:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2168996.2169003",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:03 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Concept-based video indexing generates a matrix of
                 scores predicting the possibilities of concepts
                 occurring in video shots. Based on the idea of
                 collaborative filtering, this article presents
                 unsupervised methods to refine the initial scores
                 generated by concept classifiers by taking into account
                 the concept-to-concept correlation and shot-to-shot
                 similarity embedded within the score matrix. Given a
                 noisy matrix, we refine the inaccurate scores via
                 matrix factorization. This method is further improved
                 by learning multiple local models and incorporating
                 contextual-temporal structures. Experiments on the
                 TRECVID 2006--2008 datasets demonstrate relative
                 performance gains ranging from 13\% to 52\% without
                 using any user annotations or external knowledge
                 resources.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Kankanhalli:2012:ISI,
  author =       "Mohan S. Kankanhalli",
  title =        "Introduction to special issue on multimedia security",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "31:1--31:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344437",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Weir:2012:IHV,
  author =       "Jonathan Weir and Weiqi Yan and Mohan S. Kankanhalli",
  title =        "Image hatching for visual cryptography",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "32:1--32:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344438",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Image hatching (or nonphotorealistic line-art) is a
                 technique widely used in the printing or engraving of
                 currency. Diverse styles of brush strokes have
                 previously been adopted for different areas of an image
                 to create aesthetically pleasing textures and shading.
                 Because there is no continuous tone within these types
                 of images, a multilevel scheme is proposed, which uses
                 different textures based on a threshold level. These
                 textures are then applied to the different levels and
                 are then combined to build up the final hatched image.
                 The proposed technique allows a secret to be hidden
                 using Visual Cryptography (VC) within the hatched
                 images. Visual cryptography provides a very powerful
                 means by which one secret can be distributed into two
                 or more pieces known as shares. When the shares are
                 superimposed exactly together, the original secret can
                 be recovered without computation. Also provided is a
                 comparison between the original grayscale images and
                 the resulting hatched images that are generated by the
                 proposed algorithm. This reinforces that the overall
                 quality of the hatched scheme is sufficient. The
                 Structural SIMilarity index (SSIM) is used to perform
                 this comparison.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2012:RIB,
  author =       "Jian Li and Hongmei Liu and Jiwu Huang and Yun Q.
                 Shi",
  title =        "Reference index-based {H.264} video watermarking
                 scheme",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "33:1--33:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344439",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Video watermarking has received much attention over
                 the past years as a promising solution to copy
                 protection. Watermark robustness is still a key issue
                 of research, especially when a watermark is embedded in
                 the compressed video domain. In this article, a robust
                 watermarking scheme for H.264 video is proposed. During
                 video encoding, the watermark is embedded in the index
                 of the reference frame, referred to as reference index,
                 a bitstream syntax element newly proposed in the H.264
                 standard. Furthermore, the video content (current coded
                 blocks) is modified based on an optimization model,
                 aiming at improving watermark robustness without
                 unacceptably degrading the video's visual quality or
                 increasing the video's bit rate. Compared with the
                 existing schemes, our method has the following three
                 advantages: (1) The bit rate of the watermarked video
                 is adjustable; (2) the robustness against common video
                 operations can be achieved; (3) the watermark embedding
                 and extraction are simple. Extensive experiments have
                 verified the good performance of the proposed
                 watermarking scheme.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gao:2012:RHC,
  author =       "Xifeng Gao and Caiming Zhang and Yan Huang and Zhigang
                 Deng",
  title =        "A robust high-capacity affine-transformation-invariant
                 scheme for watermarking {$3$D} geometric models",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "34:1--34:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344440",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article we propose a novel, robust, and
                 high-capacity watermarking method for 3D meshes with
                 arbitrary connectivities in the spatial domain based on
                 affine invariants. Given a 3D mesh model, a watermark
                 is embedded as affine-invariant length ratios of one
                 diagonal segment to the residing diagonal intersected
                 by the other one in a coplanar convex quadrilateral. In
                 the extraction process, a watermark is recovered by
                 combining all the watermark pieces embedded in length
                 ratios through majority voting. Extensive experimental
                 results demonstrate the robustness, high computational
                 efficiency, high capacity, and
                 affine-transformation-invariant characteristics of the
                 proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2012:EMA,
  author =       "Rui Yang and Zhenhua Qu and Jiwu Huang",
  title =        "Exposing {MP3} audio forgeries using frame offsets",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "35:1--35:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344441",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Audio recordings should be authenticated before they
                 are used as evidence. Although audio watermarking and
                 signature are widely applied for authentication, these
                 two techniques require accessing the original audio
                 before it is published. Passive authentication is
                 necessary for digital audio, especially for the most
                 popular audio format: MP3. In this article, we propose
                 a passive approach to detect forgeries of MP3 audio.
                 During the process of MP3 encoding the audio samples
                 are divided into frames, and thus each frame has its
                 own frame offset after encoding. Forgeries lead to the
                 breaking of framing grids. So the frame offset is a
                 good indication for locating forgeries, and it can be
                 retrieved by the identification of the quantization
                 characteristic. In this way, the doctored positions can
                 be automatically located. Experimental results
                 demonstrate that the proposed approach is effective in
                 detecting some common forgeries, such as deletion,
                 insertion, substitution, and splicing. Even when the
                 bit rate is as low as 32 kbps, the detection rate is
                 above 99\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Feng:2012:CAO,
  author =       "Hui Feng and Hefei Ling and Fuhao Zou and Weiqi Yan
                 and Zhengding Lu",
  title =        "A collusion attack optimization strategy for digital
                 fingerprinting",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "36:1--36:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344442",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Collusion attack is a cost-efficient attack for
                 digital fingerprinting. In this article, we propose a
                 novel collusion attack strategy, Iterative Optimization
                 Collusion Attack (IOCA), which is based upon the
                 gradient attack and the principle of informed watermark
                 embedding. We evaluate the performance of the proposed
                 collusion attack strategy in defeating four typical
                 fingerprinting schemes under a well-constructed
                 evaluation framework. The simulation results show that
                 the proposed strategy performs more effectively than
                 the gradient attack, and adopting no more than three
                 fingerprinted copies can sufficiently collapse examined
                 fingerprinting schemes. Meanwhile, the content resulted
                 from the proposed attack still preserves high
                 perceptual quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Sachan:2012:ALV,
  author =       "Amit Sachan and Sabu Emmanuel and Mohan S.
                 Kankanhalli",
  title =        "Aggregate licenses validation for digital rights
                 violation detection",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "2S",
  pages =        "37:1--37:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2344436.2344443",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:05 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Digital Rights Management (DRM) is the term associated
                 with the set of technologies to prevent illegal
                 multimedia content distribution and consumption. DRM
                 systems generally involve multiple parties such as
                 owner, distributors, and consumers. The owner issues
                 redistribution licenses to its distributors. The
                 distributors in turn using their received
                 redistribution licenses can generate and issue new
                 redistribution licenses to other distributors and new
                 usage licenses to consumers. As a part of rights
                 violation detection, these newly generated licenses
                 must be validated by a validation authority against the
                 redistribution license used to generate them. The
                 validation of these newly generated licenses becomes
                 quite complex when there exist multiple redistribution
                 licenses for a media with the distributors. In such
                 cases, the validation process requires validation using
                 an exponential number (to the number of redistribution
                 licenses) of validation inequalities and each
                 validation inequality may contain up to an exponential
                 number of summation terms. This makes the validation
                 process computationally intensive and necessitates to
                 do the validation efficiently. To overcome this, we
                 propose validation tree, a prefix-tree-based validation
                 method to do the validation efficiently. Theoretical
                 analysis and experimental results show that our
                 proposed technique reduces the validation time
                 significantly.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Riiser:2012:VSU,
  author =       "Haakon Riiser and Tore Endestad and Paul Vigmostad and
                 Carsten Griwodz and P{\aa}l Halvorsen",
  title =        "Video streaming using a location-based
                 bandwidth-lookup service for bitrate planning",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "24:1--24:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240137",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A lot of people around the world commute using public
                 transportation and would like to spend this time
                 viewing streamed video content such as news or sports
                 updates. However, mobile wireless networks typically
                 suffer from severe bandwidth fluctuations, and the
                 networks are often completely unresponsive for several
                 seconds, sometimes minutes. Today, there are several
                 ways of adapting the video bitrate and thus the video
                 quality to such fluctuations, for example, using
                 scalable video codecs or segmented adaptive HTTP
                 streaming that switches between nonscalable video
                 streams encoded in different bitrates. Still, for a
                 better long-term video playout experience that avoids
                 disruptions and frequent quality changes while using
                 existing video adaptation technology, it is desirable
                 to perform bandwidth prediction and planned quality
                 adaptation. This article describes a video streaming
                 system for receivers equipped with a GPS. A receiver's
                 download rate is constantly monitored, and periodically
                 reported back to a central database along with
                 associated GPS positional data. Thus, based on the
                 current location, a streaming device can use a
                 GPS-based bandwidth-lookup service in order to better
                 predict the near-future bandwidth availability and
                 create a schedule for the video playout that takes
                 likely future availability into account. To create a
                 prototype and perform initial tests, we conducted
                 several field trials while commuting using public
                 transportation. We show how our database has been used
                 to predict bandwidth fluctuations and network outages,
                 and how this information helps maintain uninterrupted
                 playback with less compromise on video quality than
                 possible without prediction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Valdes:2012:AEV,
  author =       "Victor Valdes and Jose M. Martinez",
  title =        "Automatic evaluation of video summaries",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "25:1--25:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240138",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article describes a method for the automatic
                 evaluation of video summaries based on the training of
                 individual predictors for different quality measures
                 from the TRECVid 2008 BBC Rushes Summarization Task.
                 The obtained results demonstrate that, with a large set
                 of evaluation data, it is possible to train fully
                 automatic evaluation systems based on visual features
                 automatically extracted from the summaries. The
                 proposed approach will enable faster and easier
                 estimation of the results of newly developed
                 abstraction algorithms and the study of which summary
                 characteristics influence their perceived quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Tian:2012:STL,
  author =       "Xinmei Tian and Dacheng Tao and Yong Rui",
  title =        "Sparse transfer learning for interactive video search
                 reranking",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "26:1--26:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240139",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Visual reranking is effective to improve the
                 performance of the text-based video search. However,
                 existing reranking algorithms can only achieve limited
                 improvement because of the well-known semantic gap
                 between low-level visual features and high-level
                 semantic concepts. In this article, we adopt
                 interactive video search reranking to bridge the
                 semantic gap by introducing user's labeling effort. We
                 propose a novel dimension reduction tool, termed sparse
                 transfer learning (STL), to effectively and efficiently
                 encode user's labeling information. STL is particularly
                 designed for interactive video search reranking.
                 Technically, it (a) considers the pair-wise
                 discriminative information to maximally separate
                 labeled query relevant samples from labeled query
                 irrelevant ones, (b) achieves a sparse representation
                 for the subspace to encodes user's intention by
                 applying the elastic net penalty, and (c) propagates
                 user's labeling information from labeled samples to
                 unlabeled samples by using the data distribution
                 knowledge. We conducted extensive experiments on the
                 TRECVID 2005, 2006 and 2007 benchmark datasets and
                 compared STL with popular dimension reduction
                 algorithms. We report superior performance by using the
                 proposed STL-based interactive video search
                 reranking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2012:IBD,
  author =       "Xin Zhang and Tom{\'a}s E. Ward and S{\'e}amus
                 Mcloone",
  title =        "An information-based dynamic extrapolation model for
                 networked virtual environments",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "27:1--27:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240140",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Various Information Management techniques have been
                 developed to help maintain a consistent shared virtual
                 world in a Networked Virtual Environment. However, such
                 techniques have to be carefully adapted to the
                 application state dynamics and the underlying network.
                 This work presents a novel framework that minimizes
                 inconsistency by optimizing bandwidth usage to deliver
                 useful information. This framework measures the state
                 evolution using an information model and dynamically
                 switches extrapolation models and the packet rate to
                 make the most information-efficient usage of the
                 available bandwidth. The results shown demonstrate that
                 this approach can help optimize consistency under
                 constrained and time-varying network conditions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2012:UCM,
  author =       "Linjun Yang and Bo Geng and Alan Hanjalic and
                 Xian-Sheng Hua",
  title =        "A unified context model for web image retrieval",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "28:1--28:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240141",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Content-based web image retrieval based on the
                 query-by-example (QBE) principle remains a challenging
                 problem due to the semantic gap as well as the gap
                 between a user's intent and the representativeness of a
                 typical image query. In this article, we propose to
                 address this problem by integrating query-related
                 contextual information into an advanced query model to
                 improve the performance of QBE-based web image
                 retrieval. We consider both the local and global
                 context of the query image. The local context can be
                 inferred from the web pages and the click-through log
                 associated with the query image, while the global
                 context is derived from the entire corpus comprising
                 all web images and the associated web pages. To
                 effectively incorporate the local query context we
                 propose a language modeling based approach to deal with
                 the combined structured query representation from the
                 contextual and visual information. The global query
                 context is integrated by the multi-modal relevance
                 model to ``reconstruct'' the query from the document
                 models indexed in the corpus. In this way, the global
                 query context is employed to address the noise or
                 missing information in the query and its local context,
                 so that a comprehensive and robust query model can be
                 obtained. We evaluated the proposed approach on a
                 representative product image dataset collected from the
                 web and demonstrated that the inclusion of the local
                 and global query contexts significantly improves the
                 performance of QBE-based web image retrieval.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Patras:2012:CTS,
  author =       "Paul Patras and Albert Banchs and Pablo Serrano",
  title =        "A control theoretic scheme for efficient video
                 transmission over {IEEE 802.11e EDCA WLANs}",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "29:1--29:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240142",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The EDCA mechanism of the IEEE 802.11 standard has
                 been designed to support, among others, video traffic.
                 This mechanism relies on a number of parameters whose
                 configuration is left open by the standard. Although
                 there are some recommended values for these parameters,
                 they are fixed independent of the WLAN conditions,
                 which results in suboptimal performance. Following this
                 observation, a number of approaches in the literature
                 have been devised to set the EDCA parameters based on
                 an estimation of the WLAN conditions. However, these
                 previous approaches are based on heuristics and hence
                 do not guarantee optimized performance. In this article
                 we propose a novel algorithm to adjust the EDCA
                 parameters to carry video traffic which, in contrast to
                 previous approaches, is sustained on mathematical
                 foundations that guarantee optimal performance. In
                 particular, our approach builds upon (i) an analytical
                 model of the WLAN performance under video traffic, used
                 to derive the optimal point of operation of EDCA, and
                 (ii) a control theoretic designed mechanism which
                 drives the WLAN to this point of operation. Via
                 extensive simulations, we show that the proposed
                 approach performs optimally and substantially
                 outperforms the standard recommended configuration as
                 well as previous adaptive proposals.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhu:2012:JLS,
  author =       "Xinglei Zhu and Chang W. Chen",
  title =        "A joint layered scheme for reliable and secure mobile
                 {JPEG-2000} streaming",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3",
  pages =        "30:1--30:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2240136.2240143",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:06 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a novel joint layered approach
                 to simultaneously achieve both reliable and secure
                 mobile JPEG-2000 image streaming. With a priori
                 knowledge of JPEG-2000 source coding and channel
                 coding, the proposed joint system integrates
                 authentication into the media error protection
                 components to ensure that every source-decodable media
                 unit is authenticated. By such a dedicated design, the
                 proposed scheme protects both compressed JPEG-2000
                 codestream and the authentication data from wireless
                 channel impairments. It is fundamentally different from
                 many existing systems that consider the problem of
                 media authentication separately from the other
                 operations in the media transmission system. By
                 utilizing the contextual relationship, such as coding
                 dependency and content importance between media slices
                 for authentication hash appending, the proposed scheme
                 generates an extremely low authentication overhead.
                 Under this joint layered coding framework, an optimal
                 rate allocation algorithm for source coding, channel
                 coding, and media authentication is developed to
                 guarantee end-to-end media quality. Experiment results
                 on JPEG-2000 images validate the proposed scheme and
                 demonstrate that the performance of the proposed scheme
                 is approaching its upper bound, in which case no
                 authentication is applied to the media stream.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gatica-Perez:2012:ISS,
  author =       "Daniel Gatica-Perez and Gang Hua and Wei Tsang Ooi and
                 P{\aa}l Halvorsen",
  title =        "Introduction to the special section of best papers of
                 {ACM Multimedia 2011}",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "38:1--38:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348817",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2012:CPA,
  author =       "Wanmin Wu and Ahsan Arefin and Gregorij Kurillo and
                 Pooja Agarwal and Klara Nahrstedt and Ruzena Bajcsy",
  title =        "{CZLoD}: a psychophysical approach for {$3$D}
                 tele-immersive video",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "39:1--39:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348818",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a psychophysical study that
                 measures the perceptual thresholds of a new factor
                 called Color-plus-Depth Level-of-Details (CZLoD)
                 peculiar to polygon-based 3D tele-immersive video. The
                 results demonstrate the existence of Just Noticeable
                 Degradation and Just Unacceptable Degradation
                 thresholds on the factor. In light of the results, we
                 design and implement a real-time perception-based
                 quality adaptor for 3D tele-immersive video. Our
                 experimental results show that the adaptation scheme
                 can reduce resource usage (e.g., CPU cycles) while
                 considerably enhancing the overall perceived visual
                 quality. Our analysis confirms the potential temporal
                 and spatial performance benefits achievable with CZLoD
                 adaptation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ji:2012:AQS,
  author =       "Rongrong Ji and Felix X. Yu and Tongtao Zhang and
                 Shih-Fu Chang",
  title =        "Active query sensing: {Suggesting} the best query view
                 for mobile visual search",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "40:1--40:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348819",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "While much exciting progress is being made in mobile
                 visual search, one important question has been left
                 unexplored in all current systems. When searching
                 objects or scenes in the 3D world, which viewing angle
                 is more likely to be successful? More particularly, if
                 the first query fails to find the right target, how
                 should the user control the mobile camera to form the
                 second query? In this article, we propose a novel
                 Active Query Sensing system for mobile location search,
                 which actively suggests the best subsequent query view
                 to recognize the physical location in the mobile
                 environment. The proposed system includes two unique
                 components: (1) an offline process for analyzing the
                 saliencies of different views associated with each
                 geographical location, which predicts the location
                 search precisions of individual views by modeling their
                 self-retrieval score distributions. (2) an online
                 process for estimating the view of an unseen query, and
                 suggesting the best subsequent view change.
                 Specifically, the optimal viewing angle change for the
                 next query can be formulated as an online information
                 theoretic approach. Using a scalable visual search
                 system implemented over a NYC street view dataset (0.3
                 million images), we show a performance gain by reducing
                 the failure rate of mobile location search to only 12\%
                 after the second query. We have also implemented an
                 end-to-end functional system, including user interfaces
                 on iPhones, client-server communication, and a remote
                 search server. This work may open up an exciting new
                 direction for developing interactive mobile media
                 applications through the innovative exploitation of
                 active sensing and query formulation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shirmohammadi:2012:ISS,
  author =       "Shervin Shirmohammadi and Mohamed Hefeeda and Wei
                 Tsang Ooi and Romulus Grigoras",
  title =        "Introduction to special section on {$3$D} mobile
                 multimedia",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "41:1--41:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348820",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2012:QOV,
  author =       "Yanwei Liu and Song Ci and Hui Tang and Yun Ye and
                 Jinxia Liu",
  title =        "{QoE}-oriented {$3$D} video transcoding for mobile
                 streaming",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "42:1--42:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348821",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With advance in mobile 3D display, mobile 3D video is
                 already enabled by the wireless multimedia networking,
                 and it will be gradually popular since it can make
                 people enjoy the natural 3D experience anywhere and
                 anytime. In current stage, mobile 3D video is generally
                 delivered over the heterogeneous network combined by
                 wired and wireless channels. How to guarantee the
                 optimal 3D visual quality of experience (QoE) for the
                 mobile 3D video streaming is one of the important
                 topics concerned by the service provider. In this
                 article, we propose a QoE-oriented transcoding approach
                 to enhance the quality of mobile 3D video service. By
                 learning the pre-controlled QoE patterns of 3D
                 contents, the proposed 3D visual QoE inferring model
                 can be utilized to regulate the transcoding
                 configurations in real-time according to the feedbacks
                 of network and user-end device information. In the
                 learning stage, we propose a piecewise linear mean
                 opinion score (MOS) interpolation method to further
                 reduce the cumbersome manual work of preparing QoE
                 patterns. Experimental results show that the proposed
                 transcoding approach can provide the adapted 3D stream
                 to the heterogeneous network, and further provide
                 superior QoE performance to the fixed quantization
                 parameter (QP) transcoding and mean squared error (MSE)
                 optimized transcoding for mobile 3D video streaming.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2012:NVT,
  author =       "Shujie Liu and Chang Wen Chen",
  title =        "A novel {$3$D} video transcoding scheme for adaptive
                 {$3$D} video transmission to heterogeneous terminals",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "43:1--43:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348822",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Three-dimensional video (3DV) is attracting many
                 interests with its enhanced viewing experience and more
                 user driven features. 3DV has several unique
                 characteristics different from 2D video: (1) It has a
                 much larger amount of data captured and compressed, and
                 corresponding video compression techniques can be much
                 more complicated in order to explore data redundancy.
                 This will lead to more constraints on users' network
                 access and computational capability, (2) Most users
                 only need part of the 3DV data at any given time, while
                 the users' requirements exhibit large diversity, (3)
                 Only a limited number of views are captured and
                 transmitted for 3DV. View rendering is thus necessary
                 to generate virtual views based on the received 3DV
                 data. However, many terminal devices do not have the
                 functionality to generate virtual views. To enable 3DV
                 experience for the majority of users with limited
                 capabilities, adaptive 3DV transmission is necessary to
                 extract/generate the required data content and
                 represent it with supported formats and bitrates for
                 heterogeneous terminal devices. 3DV transcoding is an
                 emerging and effective technique to achieve desired
                 adaptive 3DV transmission. In this article, we propose
                 the first efficient 3DV transcoding scheme that can
                 obtain any desired view, either an encoded one or a
                 virtual one, and compress it with more universal
                 H.264/AVC. The key idea of the proposed scheme is to
                 appropriately utilize motion information contained in
                 the bitstream to generate candidate motion information.
                 Original information of both the desired view and
                 reference views are used to obtain this candidate
                 information and a proper motion refinement process is
                 carried out for certain blocks. Simulation results show
                 that, compared to the straightforward cascade
                 algorithm, the proposed scheme is able to output
                 compressed bitstream of the required view with
                 significantly reduced complexity while incurring
                 negligible performance loss. Such a 3DV transcoding can
                 be applied to most gateways that usually have
                 constraints on computational complexity and time
                 delay.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Roodaki:2012:NMD,
  author =       "Hoda Roodaki and Mahmoud Reza Hashemi and Shervin
                 Shirmohammadi",
  title =        "A new methodology to derive objective quality
                 assessment metrics for scalable multiview {$3$D} video
                 coding",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "44:1--44:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348823",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the growing demand for 3D video, efforts are
                 underway to incorporate it in the next generation of
                 broadcast and streaming applications and standards. 3D
                 video is currently available in games, entertainment,
                 education, security, and surveillance applications. A
                 typical scenario for multiview 3D consists of several
                 3D video sequences captured simultaneously from the
                 same scene with the help of multiple cameras from
                 different positions and through different angles.
                 Multiview video coding provides a compact
                 representation of these multiple views by exploiting
                 the large amount of inter-view statistical
                 dependencies. One of the major challenges in this field
                 is how to transmit the large amount of data of a
                 multiview sequence over error prone channels to
                 heterogeneous mobile devices with different bandwidth,
                 resolution, and processing/battery power, while
                 maintaining a high visual quality. Scalable Multiview
                 3D Video Coding (SMVC) is one of the methods to address
                 this challenge; however, the evaluation of the overall
                 visual quality of the resulting scaled-down video
                 requires a new objective perceptual quality measure
                 specifically designed for scalable multiview 3D video.
                 Although several subjective and objective quality
                 assessment methods have been proposed for multiview 3D
                 sequences, no comparable attempt has been made for
                 quality assessment of scalable multiview 3D video. In
                 this article, we propose a new methodology to build
                 suitable objective quality assessment metrics for
                 different scalable modalities in multiview 3D video.
                 Our proposed methodology considers the importance of
                 each layer and its content as a quality of experience
                 factor in the overall quality. Furthermore, in addition
                 to the quality of each layer, the concept of disparity
                 between layers (inter-layer disparity) and disparity
                 between the units of each layer (intra-layer disparity)
                 is considered as an effective feature to evaluate
                 overall perceived quality more accurately. Simulation
                 results indicate that by using this methodology, more
                 efficient objective quality assessment metrics can be
                 introduced for each multiview 3D video scalable
                 modalities.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hamza:2012:EEM,
  author =       "Ahmed Hamza and Mohamed Hefeeda",
  title =        "Energy-efficient multicasting of multiview {$3$D}
                 videos to mobile devices",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "45:1--45:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348824",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Multicasting multiple video streams over wireless
                 broadband access networks enables the delivery of
                 multimedia content to large-scale user communities in a
                 cost-efficient manner. Three dimensional (3D) videos
                 are the next natural step in the evolution of digital
                 media technologies. In order to provide 3D perception,
                 3D video streams contain one or more views that greatly
                 increase their bandwidth requirements. Due to the
                 limited channel capacity and variable bit rate of the
                 videos, multicasting multiple 3D videos over wireless
                 broadband networks is a challenging problem. In this
                 article, we consider a 4G wireless access network in
                 which a number of 3D videos represented in two-view
                 plus depth format and encoded using scalable video
                 coders are multicast. We formulate the optimal 3D video
                 multicasting problem to maximize the quality of
                 rendered virtual views on the receivers' displays. We
                 show that this problem is NP-complete and present a
                 polynomial time approximation algorithm to solve it. We
                 then extend the proposed algorithm to efficiently
                 schedule the transmission of the chosen substreams from
                 each video in order to maximize the power saving on the
                 mobile receivers. Our simulation-based experimental
                 results show that our algorithm provides solutions that
                 are within 0.3 dB of the optimal solutions while
                 satisfying real-time requirements of multicast systems.
                 In addition, our algorithm results in an average power
                 consumption reduction of 86\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shi:2012:RTR,
  author =       "Shu Shi and Klara Nahrstedt and Roy Campbell",
  title =        "A real-time remote rendering system for interactive
                 mobile graphics",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "46:1--46:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348825",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Mobile devices are gradually changing people's
                 computing behaviors. However, due to the limitations of
                 physical size and power consumption, they are not
                 capable of delivering a 3D graphics rendering
                 experience comparable to desktops. Many applications
                 with intensive graphics rendering workloads are unable
                 to run on mobile platforms directly. This issue can be
                 addressed with the idea of remote rendering: the heavy
                 3D graphics rendering computation runs on a powerful
                 server and the rendering results are transmitted to the
                 mobile client for display. However, the simple remote
                 rendering solution inevitably suffers from the large
                 interaction latency caused by wireless networks, and is
                 not acceptable for many applications that have very
                 strict latency requirements. In this article, we
                 present an advanced low-latency remote rendering system
                 that assists mobile devices to render interactive 3D
                 graphics in real-time. Our design takes advantage of an
                 image based rendering technique: 3D image warping, to
                 synthesize the mobile display from the depth images
                 generated on the server. The research indicates that
                 the system can successfully reduce the interaction
                 latency while maintaining the high rendering quality by
                 generating multiple depth images at the carefully
                 selected viewpoints. We study the problem of viewpoint
                 selection, propose a real-time reference viewpoint
                 prediction algorithm, and evaluate the algorithm
                 performance with real-device experiments.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Guan:2012:EMM,
  author =       "Wei Guan and Suya You and Ulrich Neumann",
  title =        "Efficient matchings and mobile augmented reality",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "3s",
  pages =        "47:1--47:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348816.2348826",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Nov 6 18:13:07 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the fast-growing popularity of smart phones in
                 recent years, augmented reality (AR) on mobile devices
                 is gaining more attention and becomes more demanding
                 than ever before. However, the limited processors in
                 mobile devices are not quite promising for AR
                 applications that require real-time processing speed.
                 The challenge exists due to the fact that, while fast
                 features are usually not robust enough in matchings,
                 robust features like SIFT or SURF are not
                 computationally efficient. There is always a tradeoff
                 between robustness and efficiency and it seems that we
                 have to sacrifice one for the other. While this is true
                 for most existing features, researchers have been
                 working on designing new features with both robustness
                 and efficiency. In this article, we are not trying to
                 present a completely new feature. Instead, we propose
                 an efficient matching method for robust features. An
                 adaptive scoring scheme and a more distinctive
                 descriptor are also proposed for performance
                 improvements. Besides, we have developed an outdoor
                 augmented reality system that is based on our proposed
                 methods. The system demonstrates that not only it can
                 achieve robust matchings efficiently, it is also
                 capable to handle large occlusions such as passengers
                 and moving vehicles, which is another challenge for
                 many AR applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{TOMCCAP-STAFF:2012:TCO,
  author =       "{TOMCCAP-STAFF}",
  title =        "Table of contents: Online supplement volume 8, number
                 2s, online supplement volume 8, number 3s",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "48:1--48:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2382432",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2012:E,
  author =       "Ralf Steinmetz",
  title =        "Editorial",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "49:1--49:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379791",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2012:LRC,
  author =       "Xiaobai Liu and Shuicheng Yan and Bin Cheng and Jinhui
                 Tang and Tat-Seng Chua and Hai Jin",
  title =        "Label-to-region with continuity-biased bi-layer
                 sparsity priors",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "50:1--50:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379792",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this work, we investigate how to reassign the fully
                 annotated labels at image level to those contextually
                 derived semantic regions, namely Label-to-Region (L2R),
                 in a collective manner. Given a set of input images
                 with label annotations, the basic idea of our approach
                 to L2R is to first discover the patch correspondence
                 across images, and then propagate the common labels
                 shared in image pairs to these correlated patches.
                 Specially, our approach consists of following aspects.
                 First, each of the input images is encoded as a
                 Bag-of-Hierarchical-Patch (BOP) for capturing the rich
                 cues at variant scales, and the individual patches are
                 expressed by patch-level feature descriptors. Second,
                 we present a sparse representation formulation for
                 discovering how well an image or a semantic region can
                 be robustly reconstructed by all the other image
                 patches from the input image set. The underlying
                 philosophy of our formulation is that an image region
                 can be sparsely reconstructed with the image patches
                 belonging to the other images with common labels, while
                 the robustness in label propagation across images
                 requires that these selected patches come from very few
                 images. This preference of being sparse at both patch
                 and image level is named bi-layer sparsity prior.
                 Meanwhile, we enforce the preference of choosing
                 larger-size patches in reconstruction, referred to as
                 continuity-biased prior in this work, which may further
                 enhance the reliability of L2R assignment. Finally, we
                 harness the reconstruction coefficients to propagate
                 the image labels to the matched patches, and fuse the
                 propagation results over all patches to finalize the
                 L2R task. As a by-product, the proposed
                 continuity-biased bi-layer sparse representation
                 formulation can be naturally applied to perform image
                 annotation on new testing images. Extensive experiments
                 on three public image datasets clearly demonstrate the
                 effectiveness of our proposed framework in both L2R
                 assignment and image annotation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Rooij:2012:ETS,
  author =       "Ork de Rooij and Marcel Worring",
  title =        "Efficient targeted search using a focus and context
                 video browser",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "51:1--51:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379793",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Currently there are several interactive content-based
                 video retrieval techniques and systems available.
                 However, retrieval performance depends heavily on the
                 means of interaction. We argue that effective CBVR
                 requires efficient, specialized user interfaces. In
                 this article we propose guidelines for such an
                 interface, and we propose an effective CBVR engine: the
                 ForkBrowser, which builds upon the principle of focus
                 and context. This browser is evaluated using a
                 combination of user simulation and real user
                 evaluation. Results indicate that the ideas have merit,
                 and that the browser performs very well when compared
                 to the state-of-the-art in video retrieval.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ghinea:2012:UPM,
  author =       "Gheorghita Ghinea and Oluwakemi Ademoye",
  title =        "User perception of media content association in
                 olfaction-enhanced multimedia",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "52:1--52:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379794",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Olfaction is an exciting challenge facing multimedia
                 applications. In this article we have investigated user
                 perception of the association between olfactory media
                 content and video media content in olfactory-enhanced
                 multimedia. Results show that the association between
                 scent and content has a significant impact on the
                 user-perceived experience of olfactory-enhanced
                 multimedia.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Spicer:2012:NAD,
  author =       "Ryan Spicer and Yu-Ru Lin and Aisling Kelliher and
                 Hari Sundaram",
  title =        "{NextSlidePlease}: Authoring and delivering agile
                 multimedia presentations",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "53:1--53:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379795",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Presentation support tools, such as Microsoft
                 PowerPoint, pose challenges both in terms of creating
                 linear presentations from complex data and fluidly
                 navigating such linear structures when presenting to
                 diverse audiences. NextSlidePlease is a slideware
                 application that addresses these challenges using a
                 directed graph structure approach for authoring and
                 delivering multimedia presentations. The application
                 combines novel approaches for searching and analyzing
                 presentation datasets, composing meaningfully
                 structured presentations, and efficiently delivering
                 material under a variety of time constraints. We
                 introduce and evaluate a presentation analysis
                 algorithm intended to simplify the process of authoring
                 dynamic presentations, and a time management and path
                 selection algorithm that assists users in prioritizing
                 content during the presentation process. Results from
                 two comparative user studies indicate that the directed
                 graph approach promotes the creation of hyperlinks, the
                 consideration of connections between content items, and
                 a richer understanding of the time management
                 consequences of including and selecting presentation
                 material.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Qi:2012:OBI,
  author =       "Heng Qi and Keqiu Li and Yanming Shen and Wenyu Qu",
  title =        "Object-based image retrieval with kernel on adjacency
                 matrix and local combined features",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "54:1--54:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379796",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In object-based image retrieval, there are two
                 important issues: an effective image representation
                 method for representing image content and an effective
                 image classification method for processing user
                 feedback to find more images containing the
                 user-desired object categories. In the image
                 representation method, the local-based representation
                 is the best selection for object-based image retrieval.
                 As a kernel-based classification method, Support Vector
                 Machine (SVM) has shown impressive performance on image
                 classification. But SVM cannot work on the local-based
                 representation unless there is an appropriate kernel.
                 To address this problem, some representative kernels
                 are proposed in literatures. However, these kernels
                 cannot work effectively in object-based image retrieval
                 due to ignoring the spatial context and the combination
                 of local features. In this article, we present Adjacent
                 Matrix (AM) and the Local Combined Features (LCF) to
                 incorporate the spatial context and the combination of
                 local features into the kernel. We propose the AM-LCF
                 feature vector to represent image content and the
                 AM-LCF kernel to measure the similarities between
                 AM-LCF feature vectors. According to the detailed
                 analysis, we show that the proposed kernel can overcome
                 the deficiencies of existing kernels. Moreover, we
                 evaluate the proposed kernel through experiments of
                 object-based image retrieval on two public image sets.
                 The experimental results show that the performance of
                 object-based image retrieval can be improved by the
                 proposed kernel.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2012:VPA,
  author =       "Guangda Li and Meng Wang and Zheng Lu and Richang Hong
                 and Tat-Seng Chua",
  title =        "In-video product annotation with {Web} information
                 mining",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "55:1--55:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379797",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Product annotation in videos is of great importance
                 for video browsing, search, and advertisement. However,
                 most of the existing automatic video annotation
                 research focuses on the annotation of high-level
                 concepts, such as events, scenes, and object
                 categories. This article presents a novel solution to
                 the annotation of specific products in videos by mining
                 information from the Web. It collects a set of
                 high-quality training data for each product by
                 simultaneously leveraging Amazon and Google image
                 search engine. A visual signature for each product is
                 then built based on the bag-of-visual-words
                 representation of the training images. A correlative
                 sparsification approach is employed to remove noisy
                 bins in the visual signatures. These signatures are
                 used to annotate video frames. We conduct experiments
                 on more than 1,000 videos and the results demonstrate
                 the feasibility and effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gopinathan:2012:ASO,
  author =       "Ajay Gopinathan and Zongpeng Li",
  title =        "Algorithms for stochastic optimization of multicast
                 content delivery with network coding",
  journal =      j-TOMCCAP,
  volume =       "8",
  number =       "4",
  pages =        "56:1--56:??",
  month =        nov,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2379790.2379798",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:21 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The usage of network resources by content providers is
                 commonly governed by Service-Level Agreements (SLA)
                 between the content provider and the network service
                 provider. Resource usage exceeding the limits specified
                 in the SLA incurs the content provider additional
                 charges, usually at a higher cost. Hence, the content
                 provider's goal is to provision adequate resources in
                 the SLA based on forecasts of future demand. We study
                 capacity purchasing strategies when the content
                 provider employs network coded multicast as the media
                 delivery mechanism, with uncertainty in its future
                 customer set explicitly taken into consideration. The
                 latter requires the content provider to make capacity
                 provisioning decisions based on market predictions and
                 historical customer usage patterns. The probabilistic
                 element suggests a stochastic optimization approach. We
                 model this problem as a two-stage stochastic
                 optimization problem with recourse. Such optimizations
                 are \#P-hard to solve directly, and we design two
                 approximation algorithms for them. The first is a
                 heuristic algorithm that exploits properties unique to
                 network coding, so that only polynomial-time operations
                 are needed. It performs well in general scenarios, but
                 the gap from the optimal solution is not bounded by any
                 constant in the worst case. This motivates our second
                 approach, a sampling algorithm partly inspired from the
                 work of Gupta et al. [2004a]. We employ techniques from
                 duality theory in linear optimization to prove that the
                 sampling algorithm provides a 3-approximation to the
                 stochastic multicast problem. We conduct extensive
                 simulations to illustrate the efficacy of both
                 algorithms, and show that the performance of both is
                 usually within 10\% of the optimal solution in
                 practice.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hendrikx:2013:PCG,
  author =       "Mark Hendrikx and Sebastiaan Meijer and Joeri {Van Der
                 Velden} and Alexandru Iosup",
  title =        "Procedural content generation for games: a survey",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422957",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Hundreds of millions of people play computer games
                 every day. For them, game content---from 3D objects to
                 abstract puzzles---plays a major entertainment role.
                 Manual labor has so far ensured that the quality and
                 quantity of game content matched the demands of the
                 playing community, but is facing new scalability
                 challenges due to the exponential growth over the last
                 decade of both the gamer population and the production
                 costs. Procedural Content Generation for Games (PCG-G)
                 may address these challenges by automating, or aiding
                 in, game content generation. PCG-G is difficult, since
                 the generator has to create the content, satisfy
                 constraints imposed by the artist, and return
                 interesting instances for gamers. Despite a large body
                 of research focusing on PCG-G, particularly over the
                 past decade, ours is the first comprehensive survey of
                 the field of PCG-G. We first introduce a comprehensive,
                 six-layered taxonomy of game content: bits, space,
                 systems, scenarios, design, and derived. Second, we
                 survey the methods used across the whole field of PCG-G
                 from a large research body. Third, we map PCG-G methods
                 to game content layers; it turns out that many of the
                 methods used to generate game content from one layer
                 can be used to generate content from another. We also
                 survey the use of methods in practice, that is, in
                 commercial or prototype games. Fourth and last, we
                 discuss several directions for future research in
                 PCG-G, which we believe deserve close attention in the
                 near future.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2013:IRQ,
  author =       "Dong Liu and Shuicheng Yan and Rong-Rong Ji and
                 Xian-Sheng Hua and Hong-Jiang Zhang",
  title =        "Image retrieval with query-adaptive hashing",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422958",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Hashing-based approximate nearest-neighbor search may
                 well realize scalable content-based image retrieval.
                 The existing semantic-preserving hashing methods
                 leverage the labeled data to learn a fixed set of
                 semantic-aware hash functions. However, a fixed hash
                 function set is unable to well encode all semantic
                 information simultaneously, and ignores the specific
                 user's search intention conveyed by the query. In this
                 article, we propose a query-adaptive hashing method
                 which is able to generate the most appropriate binary
                 codes for different queries. Specifically, a set of
                 semantic-biased discriminant projection matrices are
                 first learnt for each of the semantic concepts, through
                 which a semantic-adaptable hash function set is learnt
                 via a joint sparsity variable selection model. At query
                 time, we further use the sparsity representation
                 procedure to select the most appropriate hash function
                 subset that is informative to the semantic information
                 conveyed by the query. Extensive experiments over three
                 benchmark image datasets well demonstrate the
                 superiority of our proposed query-adaptive hashing
                 method over the state-of-the-art ones in terms of
                 retrieval accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zheng:2013:GSD,
  author =       "Yan-Tao Zheng and Shuicheng Yan and Zheng-Jun Zha and
                 Yiqun Li and Xiangdong Zhou and Tat-Seng Chua and
                 Ramesh Jain",
  title =        "{GPSView}: a scenic driving route planner",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422959",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "GPS devices have been widely used in automobiles to
                 compute navigation routes to destinations. The
                 generated driving route targets the minimal traveling
                 distance, but neglects the sightseeing experience of
                 the route. In this study, we propose an augmented GPS
                 navigation system, GPSView, to incorporate a scenic
                 factor into the routing. The goal of GPSView is to plan
                 a driving route with scenery and sightseeing qualities,
                 and therefore allow travelers to enjoy sightseeing on
                 the drive. To do so, we first build a database of
                 scenic roadways with vistas of landscapes and sights
                 along the roadside. Specifically, we adapt an
                 attention-based approach to exploit
                 community-contributed GPS-tagged photos on the Internet
                 to discover scenic roadways. The premise is: a
                 multitude of photos taken along a roadway imply that
                 this roadway is probably appealing and catches the
                 public's attention. By analyzing the geospatial
                 distribution of photos, the proposed approach discovers
                 the roadside sight spots, or Points-Of-Interest (POIs),
                 which have good scenic qualities and visibility to
                 travelers on the roadway. Finally, we formulate scenic
                 driving route planning as an optimization task towards
                 the best trade-off between sightseeing experience and
                 traveling distance. Testing in the northern California
                 area shows that the proposed system can deliver
                 promising results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhou:2013:SMV,
  author =       "Wengang Zhou and Houqiang Li and Yijuan Lu and Qi
                 Tian",
  title =        "{SIFT} match verification by geometric coding for
                 large-scale partial-duplicate web image search",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422960",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Most large-scale image retrieval systems are based on
                 the bag-of-visual-words model. However, the traditional
                 bag-of-visual-words model does not capture the
                 geometric context among local features in images well,
                 which plays an important role in image retrieval. In
                 order to fully explore geometric context of all visual
                 words in images, efficient global geometric
                 verification methods have been attracting lots of
                 attention. Unfortunately, current existing methods on
                 global geometric verification are either
                 computationally expensive to ensure real-time response,
                 or cannot handle rotation well. To solve the preceding
                 problems, in this article, we propose a novel geometric
                 coding algorithm, to encode the spatial context among
                 local features for large-scale partial-duplicate Web
                 image retrieval. Our geometric coding consists of
                 geometric square coding and geometric fan coding, which
                 describe the spatial relationships of SIFT features
                 into three geo-maps for global verification to remove
                 geometrically inconsistent SIFT matches. Our approach
                 is not only computationally efficient, but also
                 effective in detecting partial-duplicate images with
                 rotation, scale changes, partial-occlusion, and
                 background clutter. Experiments in partial-duplicate
                 Web image search, using two datasets with one million
                 Web images as distractors, reveal that our approach
                 outperforms the baseline bag-of-visual-words approach
                 even following a RANSAC verification in mean average
                 precision. Besides, our approach achieves comparable
                 performance to other state-of-the-art global geometric
                 verification methods, for example, spatial coding
                 scheme, but is more computationally efficient.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Park:2013:ISL,
  author =       "Jong-Seung Park and Ramesh Jain",
  title =        "Identification of scene locations from geotagged
                 images",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422961",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Due to geotagging capabilities of consumer cameras, it
                 has become easy to capture the exact geometric location
                 where a picture is taken. However, the location is not
                 the whereabouts of the scene taken by the photographer
                 but the whereabouts of the photographer himself. To
                 determine the actual location of an object seen in a
                 photo some sophisticated and tiresome steps are
                 required on a special camera rig, which are generally
                 not available in common digital cameras. This article
                 proposes a novel method to determine the geometric
                 location corresponding to a specific image pixel. A new
                 technique of stereo triangulation is introduced to
                 compute the relative depth of a pixel position.
                 Geographical metadata embedded in images are utilized
                 to convert relative depths to absolute coordinates.
                 When a geographic database is available we can also
                 infer the semantically meaningful description of a
                 scene object from where the specified pixel is
                 projected onto the photo. Experimental results
                 demonstrate the effectiveness of the proposed approach
                 in accurately identifying actual locations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2013:RAA,
  author =       "Yichuan Wang and Ting-An Lin and Cheng-Hsin Hsu and
                 Xin Liu",
  title =        "Region- and action-aware virtual world clients",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422962",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "We propose region- and action-aware virtual world
                 clients. To develop such clients, we present a
                 parameterized network traffic model, based on a large
                 collection of Second Life traces gathered by us. Our
                 methodology is also applicable to virtual worlds other
                 than Second Life. With the traffic model, various
                 optimization criteria can be adopted, including visual
                 quality, response time, and energy consumption. We use
                 energy consumption as the show case, and demonstrate
                 via trace-driven simulations that, compared to two
                 existing schemes, a mobile client can save up to 36\%
                 and 41\% communication energy by selectively turning on
                 its WiFi network interface.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Khodabakhshi:2013:SSF,
  author =       "Naghmeh Khodabakhshi and Mohamed Hefeeda",
  title =        "{Spider}: a system for finding {$3$D} video copies",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422963",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a novel content-based copy
                 detection system for 3D videos. The system creates
                 compact and robust depth and visual signatures from the
                 3D videos. Then, signature of a query video is compared
                 against an indexed database of reference videos'
                 signatures. The system returns a score, using both
                 spatial and temporal characteristics of videos,
                 indicating whether the query video matches any video in
                 the reference video database, and in case of matching,
                 which portion of the reference video matches the query
                 video. Analysis shows that the system is efficient,
                 both computationally and storage-wise. The system can
                 be used, for example, by video content owners, video
                 hosting sites, and third-party companies to find
                 illegally copied 3D videos. We implemented Spider, a
                 complete realization of the proposed system, and
                 conducted rigorous experiments on it. Our experimental
                 results show that the proposed system can achieve high
                 accuracy in terms of precision and recall even if the
                 3D videos are subjected to several transformations at
                 the same time. For example, the proposed system yields
                 100\% precision and recall when copied videos are parts
                 of original videos, and more than 90\% precision and
                 recall when copied videos are subjected to different
                 individual transformations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Abrams:2013:WAG,
  author =       "Austin Abrams and Robert Pless",
  title =        "{Web}-accessible geographic integration and
                 calibration of webcams",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2422956.2422964",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun May 5 09:14:22 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A global network of webcams offers unique viewpoints
                 from tens of thousands of locations. Understanding the
                 geographic context of this imagery is vital in using
                 these cameras for quantitative environmental monitoring
                 or surveillance applications. We derive robust
                 geo-calibration constraints that allow users to
                 geo-register static or pan-tilt-zoom cameras by
                 specifying a few corresponding points, and describe our
                 Web interface suitable for novices. We discuss design
                 decisions that support our scalable, publicly
                 accessible Web service that allows webcam textures to
                 be displayed live on 3D geographic models. Finally, we
                 demonstrate several multimedia applications for
                 geo-calibrated cameras.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2013:EN,
  author =       "Ralf Steinmetz",
  title =        "Editorial note",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "31:1--31:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2523001.2523002",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Nahrstedt:2013:ISS,
  author =       "Klara Nahrstedt and Rainer Lienhart and Malcolm
                 Slaney",
  title =        "Introduction to the special section on the 20th
                 anniversary of the {ACM International Conference on
                 Multimedia}",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "32:1--32:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2523001.2523003",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Li:2013:TDI,
  author          = {Baochun Li and Zhi Wang and Jiangchuan Liu and Wenwu
                     Zhu},
  title           = {Two decades of {Internet} video streaming: a
                     retrospective view},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {33:1--33:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2505805},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {For over two decades, video streaming over the
                     Internet has received a substantial amount of attention
                     from both academia and industry. Starting from the
                     design of transport protocols for streaming video,
                     research interests have later shifted to the
                     peer-to-peer paradigm of designing streaming protocols
                     at the application layer. More recent research has
                     focused on building more practical and scalable
                     systems, using Dynamic Adaptive Streaming over HTTP. In
                     this article, we provide a retrospective view of the
                     research results over the past two decades, with a
                     focus on peer-to-peer streaming protocols and the
                     effects of cloud computing and social media.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {33},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Huang:2013:ETM,
  author          = {Zixia Huang and Klara Nahrstedt and Ralf Steinmetz},
  title           = {Evolution of temporal multimedia synchronization
                     principles: a historical viewpoint},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {34:1--34:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2490821},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The evolution of multimedia applications has
                     drastically changed human life and behaviors. New
                     communication technologies lead to new requirements for
                     multimedia synchronization. This article presents a
                     historical view of temporal synchronization studies
                     focusing on continuous multimedia. We demonstrate how
                     the development of multimedia systems has created new
                     challenges for synchronization technologies. We
                     conclude with a new application-dependent,
                     multilocation, multirequirement synchronization
                     framework to address these new challenges.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {34},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Bulterman:2013:SAM,
  author          = {Dick C. A. Bulterman and Pablo Cesar and Rodrigo
                     Laiola Guimar{\~a}es},
  title           = {Socially-aware multimedia authoring: {Past}, present,
                     and future},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {35:1--35:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2491893},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Creating compelling multimedia productions is a
                     nontrivial task. This is as true for creating
                     professional content as it is for nonprofessional
                     editors. During the past 20 years, authoring networked
                     content has been a part of the research agenda of the
                     multimedia community. Unfortunately, authoring has been
                     seen as an initial enterprise that occurs before `real'
                     content processing takes place. This limits the options
                     open to authors and to viewers of rich multimedia
                     content for creating and receiving focused, highly
                     personal media presentations. This article reflects on
                     the history of multimedia authoring. We focus on the
                     particular task of supporting socially-aware
                     multimedia, in which the relationships within
                     particular social groups among authors and viewers can
                     be exploited to create highly personal media
                     experiences. We provide an overview of the requirements
                     and characteristics of socially-aware multimedia
                     authoring within the context of exploiting community
                     content. We continue with a short historical
                     perspective on authoring support for these types of
                     situations. We then present an overview of a current
                     system for supporting socially-aware multimedia
                     authoring within the community content. We conclude
                     with a discussion of the issues that we feel can
                     provide a fruitful basis for future multimedia
                     authoring support. We argue that providing support for
                     socially-aware multimedia authoring can have a profound
                     impact on the nature and architecture of the entire
                     multimedia information processing pipeline.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {35},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

%%% NB: the dash in the title is coded as a TeX em dash (---); a single
%%% hyphen would typeset the unintended compound ``search-from''.
@Article{Zhang:2013:IST,
  author =       "Lei Zhang and Yong Rui",
  title =        "Image search---from thousands to billions in 20 years",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "36:1--36:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2490823",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a comprehensive review and
                 analysis on image search in the past 20 years,
                 emphasizing the challenges and opportunities brought by
                 the astonishing increase of dataset scales from
                 thousands to billions in the same time period, which
                 was witnessed first-hand by the authors as active
                 participants in this research area. Starting with a
                 retrospective review of three stages of image search in
                 the history, the article highlights major breakthroughs
                 around the year 2000 in image search features, indexing
                 methods, and commercial systems, which marked the
                 transition from stage two to stage three. Subsequent
                 sections describe the image search research from four
                 important aspects: system framework, feature extraction
                 and image representation, indexing, and big data's
                 potential. Based on the review, the concluding section
                 discusses open research challenges and suggests future
                 research directions in effective visual representation,
                 image knowledge base construction, implicit user
                 feedback and crowdsourcing, mobile image search, and
                 creative multimedia interfaces.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% NB: the parenthetical phrase in the abstract's final sentence is set
%%% off with TeX em dashes (---) rather than single hyphens.
@Article{Rowe:2013:LFY,
  author =       "Lawrence A. Rowe",
  title =        "Looking forward 10 years to multimedia successes",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "37:1--37:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2490825",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A panel at ACM Multimedia 2012 addressed research
                 successes in the past 20 years. While the panel focused
                 on the past, this article discusses successes since the
                 ACM SIGMM 2003 Retreat and suggests research directions
                 in the next ten years. While significant progress has
                 been made, more research is required to allow
                 multimedia to impact our everyday computing
                 environment. The importance of hardware changes on
                 future research directions is discussed. We believe
                 ubiquitous computing---meaning abundant computation and
                 network bandwidth---should be applied in novel ways to
                 solve multimedia grand challenges and continue the IT
                 revolution of the past century.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Shenoy:2013:MSR,
  author          = {Prashant Shenoy},
  title           = {Multimedia systems research: {The} first twenty years
                     and lessons for the next twenty},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {38:1--38:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2490859},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This retrospective article examines the past two
                     decades of multimedia systems research through the lens
                     of three research topics that were in vogue in the
                     early days of the field and offers perspectives on the
                     evolution of these research topics. We discuss the
                     eventual impact of each line of research and offer
                     lessons for future research in the field.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Hua:2013:OVD,
  author          = {Kien A. Hua},
  title           = {Online video delivery: {Past}, present, and future},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {39:1--39:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2502435},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Video streaming is the core technology for online
                     video delivery systems. Initial research on this
                     technology faced many challenges. In this article,
                     lessons learned from beginning trials are discussed;
                     some pioneering works that provided early solutions and
                     inspired subsequent research are presented; and new
                     techniques required for emerging applications are
                     examined.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {39},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Swaminathan:2013:WMV,
  author          = {Viswanathan Swaminathan},
  title           = {Are we in the middle of a video streaming
                     revolution?},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {40:1--40:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2490826},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {It has been roughly 20 years since the beginning of
                     video streaming over the Internet. Until very recently,
                     video streaming experiences left much to be desired.
                     Over the last few years, this has significantly
                     improved making monetization of streaming, possible.
                     Recently, there has been an explosion of commercial
                     video delivery services over the Internet, sometimes
                     referred to as over-the-top (OTT) delivery. All these
                     services invariably use streaming technologies.
                     Initially, streaming had all the promise, then for a
                     long time, it was download and play, later progressive
                     download for short content, and now it is streaming
                     again. Did streaming win the download versus streaming
                     contest? Did the best technology win? The improvement
                     in streaming experience has been possible through a
                     variety of new streaming technologies, some proprietary
                     and others extensions to standard protocols. The
                     primary delivery mechanism for entertainment video,
                     both premium content like movies and user generated
                     content (UGC), tends to be HTTP streaming. Is HTTP
                     streaming the panacea for all problems? The goal of
                     this article is to give an industry perspective of what
                     fundamentally changed in video streaming that makes it
                     commercially viable now. This article outlines how a
                     blend of technology choices between download and
                     streaming makes the current wave of ubiquitous
                     streaming possible for entertainment video delivery.
                     After identifying problems that still need to be
                     solved, the article concludes with the lessons learnt
                     from the video streaming evolution.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Chou:2013:AIC,
  author          = {Philip A. Chou},
  title           = {Advances in immersive communication: (1) {Telephone},
                     (2) {Television}, (3) {Teleportation}},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {41:1--41:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2492704},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The last great advances in immersive communication
                     were the invention of the telephone over 137 years ago
                     and the invention of the video telephone (n{\'e}
                     television) over 86 years ago. However, a perfect storm
                     is brewing for the next advance in immersive
                     communication, thanks to the convergence of massive
                     amounts of computation, bandwidth, resolution, new
                     sensors, and new displays. It could well be the
                     Multimedia community that turns this brew into the next
                     great advance in immersive communication, something
                     akin to teleportation.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {41},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Chang:2013:HFW,
  author          = {Shih-Fu Chang},
  title           = {How far we've come: {Impact} of 20 years of multimedia
                     information retrieval},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {42:1--42:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2491844},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article reviews the major research trends that
                     emerged in the last two decades within the broad area
                     of multimedia information retrieval, with a focus on
                     the ACM Multimedia community. Trends are defined
                     (nonscientifically) to be topics that appeared in ACM
                     multimedia publications and have had a significant
                     number of citations. The article also assesses the
                     impacts of these trends on real-world applications. The
                     views expressed are subjective and likely biased but
                     hopefully useful for understanding the heritage of the
                     community and stimulating new research direction.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {42},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Effelsberg:2013:PLB,
  author          = {Wolfgang Effelsberg},
  title           = {A personal look back at twenty years of research in
                     multimedia content analysis},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {43:1--43:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2502434},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This paper is a personal look back at twenty years of
                     research in multimedia content analysis. It addresses
                     the areas of audio, photo and video analysis for the
                     purpose of indexing and retrieval from the perspective
                     of a multimedia researcher. Whereas a general analysis
                     of content is impossible due to the personal bias of
                     the user, significant progress was made in the
                     recognition of specific objects or events. The paper
                     concludes with a brief outlook on the future.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {43},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Hanjalic:2013:MRM,
  author          = {Alan Hanjalic},
  title           = {Multimedia retrieval that matters},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {44:1--44:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2490827},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article emphasizes the need to refocus multimedia
                     information retrieval (MIR) research towards bridging
                     the utility gap, the gap between the expected and
                     defacto usefulness of MIR solutions. This requires us
                     to revisit the notion of relevance, but also to
                     consider other criteria for assessing MIR solutions,
                     like the informativeness of the retrieved results and
                     how helpful they are for the users. The article also
                     states that this focus shift cannot be realized
                     incrementally, but by revisiting the foundations of MIR
                     solutions, that is, by a utility-by-design approach. In
                     this respect, a number of research challenges are
                     proposed.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Turk:2013:TYE,
  author          = {Matthew Turk},
  title           = {Over twenty years of eigenfaces},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {45:1--45:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2490824},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The inaugural ACM Multimedia Conference coincided with
                     a surge of interest in computer vision technologies for
                     detecting and recognizing people and their activities
                     in images and video. Face recognition was the first of
                     these topics to broadly engage the vision and
                     multimedia research communities. The Eigenfaces
                     approach was, deservedly or not, the method that
                     captured much of the initial attention, and it
                     continues to be taught and used as a benchmark over 20
                     years later. This article is a brief personal view of
                     the genesis of Eigenfaces for face recognition and its
                     relevance to the multimedia community.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {45},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Whitman:2013:CSF,
  author          = {Brian Whitman},
  title           = {Care and scale: {Fifteen} years of music retrieval},
  journal         = j-TOMCCAP,
  volume          = {9},
  number          = {1s},
  pages           = {46:1--46:??},
  month           = oct,
  year            = {2013},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2492703},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 13 07:37:45 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The co-founder of The Echo Nest, a music intelligence
                     company that now powers recommendation and discovery
                     for most music services, discusses the notion of care
                     and scale, cultural analysis of music, a brief history
                     of music retrieval, and how and why The Echo Nest got
                     started.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {46},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

%%% NB(review): ``is'' has been inserted in the abstract sentence ``One
%%% natural way to navigate these images is in a 3D geo-located
%%% context'', which otherwise has no verb; confirm against the
%%% publisher's abstract.
@Article{Szeliski:2013:NWC,
  author =       "Richard Szeliski and Noah Snavely and Steven M.
                 Seitz",
  title =        "Navigating the worldwide community of photos",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "47:1--47:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2492208",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The last decade has seen an explosion in the number of
                 photographs available on the Internet. The sheer volume
                 of interesting photos makes it a challenge to explore
                 this space. Various Web and social media sites, along
                 with search and indexing techniques, have been
                 developed in response. One natural way to navigate
                 these images is in a 3D geo-located context. In this
                 article, we reflect on our work in this area, with a
                 focus on techniques that build partial 3D scene models
                 to help find and navigate interesting photographs in an
                 interactive, immersive 3D setting. We also discuss how
                 finding such relationships among photographs opens up
                 exciting new possibilities for multimedia authoring,
                 visualization, and editing.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% NB: the author's surname is coded with its acute accent on the final
%%% e; the citation label deliberately stays in unaccented ASCII.
@Article{Andre:2013:EUU,
  author =       "Elisabeth Andr{\'e}",
  title =        "Exploiting unconscious user signals in multimodal
                 human-computer interaction",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "48:1--48:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2502433",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents the idea of empathic stimulation
                 that relies on the power and potential of unconsciously
                 conveyed attentive and emotional information to
                 facilitate human-machine interaction. Starting from a
                 historical review of related work presented at past ACM
                 Multimedia conferences, we discuss challenges that
                 arise when exploiting unconscious human signals for
                 empathic stimulation, such as the real-time analysis of
                 psychological user states and the smooth adaptation of
                 the human-machine interface based on this analysis. A
                 classical application field that might benefit from the
                 idea of unconscious human-computer interaction is the
                 exploration of massive datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% NB: in the abstract, the dash introducing the list of research
%%% initiatives is coded as a TeX em dash (---) rather than a hyphen.
@Article{Sundaram:2013:EMS,
  author =       "Hari Sundaram",
  title =        "Experiential media systems",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "49:1--49:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2502432",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a personalized narrative on the
                 early discussions within the Multimedia community and
                 the subsequent research on experiential media systems.
                 I discuss two different research initiatives---design of
                 real-time, immersive multimedia feedback environments
                 for stroke rehabilitation; exploratory environments for
                 events that exploited the user's ability to make
                 connections. I discuss the issue of foundations: the
                 question of multisensory integration and
                 superadditivity; the need for identification of
                 ``first-class'' Multimedia problems; expanding the
                 scope of Multimedia research.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Kompatsiaris:2013:ISS,
  author =       "Ioannis (Yiannis) Kompatsiaris and Wenjun (Kevin) Zeng
                 and Gang Hua and Liangliang Cao",
  title =        "Introduction to the special section of best papers of
                 {ACM Multimedia} 2012",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "50:1--50:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2523001.2523004",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2013:RAM,
  author =       "Heng Liu and Tao Mei and Houqiang Li and Jiebo Luo and
                 Shipeng Li",
  title =        "Robust and accurate mobile visual localization and its
                 applications",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "51:1--51:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2491735",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Mobile applications are becoming increasingly popular.
                 More and more people are using their phones to enjoy
                 ubiquitous location-based services (LBS). The
                 increasing popularity of LBS creates a fundamental
                 problem: mobile localization. Besides traditional
                 localization methods that use GPS or wireless signals,
                 using phone-captured images for localization has drawn
                 significant interest from researchers. Photos contain
                 more scene context information than the embedded
                 sensors, leading to a more precise location
                 description. With the goal being to accurately sense
                 real geographic scene contexts, this article presents a
                 novel approach to mobile visual localization according
                 to a given image (typically associated with a rough GPS
                 position). The proposed approach is capable of
                 providing a complete set of more accurate parameters
                 about the scene geo-context including the real
                 locations of both the mobile user and perhaps more
                 importantly the captured scene, as well as the viewing
                 direction. To figure out how to make image localization
                 quick and accurate, we investigate various techniques
                 for large-scale image retrieval and 2D-to-3D matching.
                 Specifically, we first generate scene clusters using
                 joint geo-visual clustering, with each scene being
                 represented by a reconstructed 3D model from a set of
                 images. The 3D models are then indexed using a visual
                 vocabulary tree structure. Taking geo-tags of the
                 database image as prior knowledge, a novel
                 location-based codebook weighting scheme proposed to
                 embed this additional information into the codebook.
                 The discriminative power of the codebook is enhanced,
                 thus leading to better image retrieval performance. The
                 query image is aligned with the models obtained from
                 the image retrieval results, and eventually registered
                 to a real-world map. We evaluate the effectiveness of
                 our approach using several large-scale datasets and
                 achieving estimation accuracy of a user's location
                 within 13 meters, viewing direction within 12 degrees,
                 and viewing distance within 26 meters. Of particular
                 note is our showcase of three novel applications based
                 on localization results: (1) an on-the-spot tour guide,
                 (2) collaborative routing, and (3) a sight-seeing
                 guide. The evaluations through user studies demonstrate
                 that these applications are effective in facilitating
                 the ideal rendezvous for mobile users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2013:PBS,
  author =       "Zhi Wang and Wenwu Zhu and Xiangwen Chen and Lifeng
                 Sun and Jiangchuan Liu and Minghua Chen and Peng Cui
                 and Shiqiang Yang",
  title =        "Propagation-based social-aware multimedia content
                 distribution",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "52:1--52:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2523001.2523005",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online social networks have reshaped how multimedia
                 contents are generated, distributed, and consumed on
                 today's Internet. Given the massive number of
                 user-generated contents shared in online social
                 networks, users are moving to directly access these
                 contents in their preferred social network services. It
                 is intriguing to study the service provision of social
                 contents for global users with satisfactory quality of
                 experience. In this article, we conduct large-scale
                 measurement of a real-world online social network
                 system to study the social content propagation. We have
                 observed important propagation patterns, including
                 social locality, geographical locality, and temporal
                 locality. Motivated by the measurement insights, we
                 propose a propagation-based social-aware delivery
                 framework using a hybrid edge-cloud and peer-assisted
                 architecture. We also design replication strategies for
                 the architecture based on three propagation predictors
                 designed by jointly considering user, content, and
                 context information. In particular, we design a
                 propagation region predictor and a global audience
                 predictor to guide how the edge-cloud servers backup
                 the contents, and a local audience predictor to guide
                 how peers cache the contents for their friends. Our
                 trace-driven experiments further demonstrate the
                 effectiveness and superiority of our design.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Sang:2013:SIA,
  author =       "Jitao Sang and Changsheng Xu",
  title =        "Social influence analysis and application on
                 multimedia sharing websites",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "1s",
  pages =        "53:1--53:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2502436",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:45 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Social media is becoming popular these days, where
                 users necessarily interact with each other to form
                 social networks. Influence network, as one special case
                 of social network, has been recognized as significantly
                 impacting social activities and user decisions. We
                 emphasize in this article that the inter-user influence
                 is essentially topic-sensitive, as for different tasks
                 users tend to trust different influencers and be
                 influenced most by them. While existing research
                 focuses on global influence modeling and applies to
                 text-based networks, this work investigates the problem
                 of topic-sensitive influence modeling in the multimedia
                 domain. According to temporal data justification, we
                 propose a multimodal probabilistic model, considering
                 both users' textual annotation and uploaded visual
                 images. This model is capable of simultaneously
                 extracting user topic distributions and topic-sensitive
                 influence strengths. By identifying the topic-sensitive
                 influencer, we are able to conduct applications, like
                 collective search and collaborative recommendation. A
                 risk minimization-based general framework for
                 personalized image search is further presented, where
                 the image search task is transferred to measure the
                 distance of image and personalized query language
                 models. The framework considers the noisy tag issue and
                 enables easy incorporation of social influence. We have
                 conducted experiments on a large-scale Flickr dataset.
                 Qualitative as well as quantitative evaluation results
                 have validated the effectiveness of the topic-sensitive
                 influencer mining model, and demonstrated the advantage
                 of incorporating topic-sensitive influence in
                 personalized image search and topic-based image
                 recommendation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Silva:2013:HPH,
  author =       "Juan M. Silva and Mauricio Orozco and Jongeun Cha and
                 Abdulmotaleb {El Saddik} and Emil M. Petriu",
  title =        "Human perception of haptic-to-video and
                 haptic-to-audio skew in multimedia applications",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457451",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The purpose of this research is to assess the
                 sensitivity of humans to perceive asynchrony among
                 media signals coming from a computer application.
                 Particularly we examine haptic-to-video and
                 haptic-to-audio skew. For this purpose we have designed
                 an experimental setup, where users are exposed to a
                 basic multimedia presentation resembling a ping-pong
                 game. For every collision between a ball and a racket,
                 the user is able to perceive auditory, visual, and
                 haptic cues about the collision event. We artificially
                 introduce negative and positive delay to the auditory
                 and visual cues with respect to the haptic stream. We
                 subjectively evaluate the perception of inter-stream
                 asynchrony perceived by the users using two types of
                 haptic devices. The statistical results of our
                 evaluation show perception rates of around 100 ms
                 regardless of modality and type of device.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bhatt:2013:RPB,
  author =       "Chidansh A. Bhatt and Pradeep K. Atrey and Mohan S.
                 Kankanhalli",
  title =        "A reward-and-punishment-based approach for concept
                 detection using adaptive ontology rules",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457452",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Despite the fact that performance improvements have
                 been reported in the last years, semantic concept
                 detection in video remains a challenging problem.
                 Existing concept detection techniques, with ontology
                 rules, exploit the static correlations among primitive
                 concepts but not the dynamic spatiotemporal
                 correlations. The proposed method rewards (or punishes)
                 detected primitive concepts using dynamic
                 spatiotemporal correlations of the given ontology rules
                 and updates these ontology rules based on the accuracy
                 of detection. Adaptively learned ontology rules
                 significantly help in improving the overall accuracy of
                 concept detection as shown in the experimental
                 result.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Alsulaiman:2013:IVB,
  author =       "Fawaz A. Alsulaiman and Nizar Sakr and Julio J.
                 Vald{\'e}s and Abdulmotaleb {El Saddik}",
  title =        "Identity verification based on handwritten signatures
                 with haptic information using genetic programming",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457453",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, haptic-based handwritten signature
                 verification using Genetic Programming (GP)
                 classification is presented. A comparison of GP-based
                 classification with classical classifiers including
                 support vector machine, $k$-nearest neighbors,
                 na{\"\i}ve Bayes, and random forest is conducted. In
                 addition, the use of GP in discovering small
                 knowledge-preserving subsets of features in
                 high-dimensional datasets of haptic-based signatures is
                 investigated and several approaches are explored.
                 Subsets of features extracted from GP-generated models
                 (analytic functions) are also exploited to determine
                 the importance and relevance of different haptic data
                 types (e.g., force, position, torque, and orientation)
                 in user identity verification. The results revealed
                 that GP classifiers compare favorably with the
                 classical methods and use a much fewer number of
                 attributes (with simple function sets).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2013:MAS,
  author =       "Qianni Zhang and Ebroul Izquierdo",
  title =        "Multifeature analysis and semantic context learning
                 for image classification",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457454",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article introduces an image classification
                 approach in which the semantic context of images and
                 multiple low-level visual features are jointly
                 exploited. The context consists of a set of semantic
                 terms defining the classes to be associated to
                 unclassified images. Initially, a multiobjective
                 optimization technique is used to define a multifeature
                 fusion model for each semantic class. Then, a Bayesian
                 learning procedure is applied to derive a context model
                 representing relationships among semantic classes.
                 Finally, this context model is used to infer object
                 classes within images. Selected results from a
                 comprehensive experimental evaluation are reported to
                 show the effectiveness of the proposed approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhao:2013:MEU,
  author =       "Zhen Wei Zhao and Sameer Samarth and Wei Tsang Ooi",
  title =        "Modeling the effect of user interactions on mesh-based
                 {P2P VoD} streaming systems",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457455",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "User interactions such as seeks and pauses are widely
                 supported by existing Peer-to-Peer Video-on-Demand (P2P
                 VoD) streaming systems. Their effect on the streaming
                 system, however, has not been well studied. Seeks cause
                 peers to skip part of the video, making them stay in
                 the system for shorter time, and thus contribute less.
                 On the other hand, only part of the video is downloaded
                 due to seeks, reducing peers' demand from the system.
                 It is unclear which factor dominates the effect of
                 seeks on the streaming system. Pauses during playback,
                 on one hand, allow peers to stay longer in the system
                 and upload more content. When interleaved with seeks,
                 however, long pauses may increase peers' demand
                 unnecessarily as peers may download content that will
                 eventually be skipped by subsequent forward seeks. The
                 collective effect of seeks and pauses, together with
                 the known random peer departure, is unintuitive and
                 needs to be addressed properly so as to understand the
                 effect of human factors on the streaming system
                 performance. In this article, we develop an analytical
                 model to both qualitatively and quantitatively study
                 the effect of seeks and pauses on mesh-based P2P VoD
                 streaming systems, in particular, the effect on the
                 server cost. Our model can help in understanding how
                 human factors such as seeks and pauses affect the
                 streaming system performance, tuning a P2P VoD system
                 towards better system performance and stability, and
                 providing a framework for capacity planning.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2013:ETT,
  author =       "Yang Yang and Yi Yang and Heng Tao Shen",
  title =        "Effective transfer tagging from image to video",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457456",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Recent years have witnessed a great explosion of
                 user-generated videos on the Web. In order to achieve
                 an effective and efficient video search, it is critical
                 for modern video search engines to associate videos
                 with semantic keywords automatically. Most of the
                 existing video tagging methods can hardly achieve
                 reliable performance due to deficiency of training
                 data. It is noticed that abundant well-tagged data are
                 available in other relevant types of media (e.g.,
                 images). In this article, we propose a novel video
                 tagging framework, termed as Cross-Media Tag Transfer
                 (CMTT), which utilizes the abundance of well-tagged
                 images to facilitate video tagging. Specifically, we
                 build a ``cross-media tunnel'' to transfer knowledge
                 from images to videos. To this end, an optimal kernel
                 space, in which distribution distance between images
                 and video is minimized, is found to tackle the
                 domain-shift problem. A novel cross-media video tagging
                 model is proposed to infer tags by exploring the
                 intrinsic local structures of both labeled and
                 unlabeled data, and learn reliable video classifiers.
                 An efficient algorithm is designed to optimize the
                 proposed model in an iterative and alternative way.
                 Extensive experiments illustrate the superiority of our
                 proposal compared to the state-of-the-art algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhao:2013:AAP,
  author =       "Zhen Wei Zhao and Wei Tsang Ooi",
  title =        "{APRICOD}: an access-pattern-driven distributed
                 caching middleware for fast content discovery of
                 noncontinuous media access",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "2",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457450.2457457",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:48 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Content discovery is a major source of latency in
                 peer-to-peer (P2P) media streaming systems, especially
                 in the presence of noncontinuous user access, such as
                 random seek in Video-on-Demand (VoD) streaming and
                 teleportation in a Networked Virtual Environment (NVE).
                 After the aforementioned user interactions, streaming
                 systems often need to initiate the content discovery
                 process to identify where to retrieve the requested
                 media objects. Short content lookup latency is demanded
                 to ensure smooth user experience. Existing content
                 discovery systems based on either a Distributed Hash
                 Table (DHT) or gossip mechanism cannot cope with
                 noncontinuous access efficiently due to their long
                 lookup latency. In this work, we propose an
                 access-pattern-driven distributed caching middleware
                 named APRICOD, which caters for fast and scalable
                 content discovery in peer-to-peer media streaming
                 systems, especially when user interactions are present.
                 APRICOD exploits correlations among media objects
                 accessed by users, and adapts to shift in the user
                 access pattern automatically. We first present a
                 general APRICOD design that can be used with any
                 existing content discovery system. We then present an
                 implementation of APRICOD on top of Pastry, which we
                 use to evaluate APRICOD. Our evaluation in a 1024-node
                 system, using a Second Life trace with 5,735 users and
                 a VoD trace with 54 users, shows that APRICOD can
                 effectively resolve all continuous access queries with
                 a single hop deterministically with node failure as an
                 exception, and resolve noncontinuous access queries
                 with a single hop with high probability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Anonymous:2013:CPM,
  author =       "Anonymous",
  title =        "Call for papers: {Multiple} sensorial {(MulSeMedia)}
                 multi-modal media: {Advances} and applications",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "3",
  pages =        "15:1--15:??",
  month =        jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2487268.2500818",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:50 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Mei:2013:NLS,
  author =       "Tao Mei and Lin-Xie Tang and Jinhui Tang and
                 Xian-Sheng Hua",
  title =        "Near-lossless semantic video summarization and its
                 applications to video analysis",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "3",
  pages =        "16:1--16:??",
  month =        jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2487268.2487269",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:50 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The ever increasing volume of video content on the Web
                 has created profound challenges for developing
                 efficient indexing and search techniques to manage
                 video data. Conventional techniques such as video
                 compression and summarization strive for the two
                 commonly conflicting goals of low storage and high
                 visual and semantic fidelity. With the goal of
                 balancing both video compression and summarization,
                 this article presents a novel approach, called
                 Near-Lossless Semantic Summarization (NLSS), to
                 summarize a video stream with the least high-level
                 semantic information loss by using an extremely small
                 piece of metadata. The summary consists of compressed
                 image and audio streams, as well as the metadata for
                 temporal structure and motion information. Although at
                 a very low compression rate (around $ 1 / 40 $ of
                 H.264 baseline, where traditional compression
                 techniques can hardly preserve an acceptable visual
                 fidelity), the proposed NLSS still can be applied to
                 many video-oriented tasks, such as visualization,
                 indexing and browsing, duplicate detection, concept
                 detection, and so on. We evaluate the NLSS on TRECVID
                 and other video collections, and demonstrate that it is
                 a powerful tool for significantly reducing storage
                 consumption, while keeping high-level semantic
                 fidelity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Ademoye:2013:IRT,
  author = {Oluwakemi A. Ademoye and Gheorghita Ghinea},
  title = {Information recall task impact in olfaction-enhanced
    multimedia},
  journal = j-TOMCCAP,
  volume = {9},
  number = {3},
  pages = {17:1--17:??},
  month = jun,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2487268.2487270},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:50 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Enhancing multimedia applications with olfactory
    sensations is one of the last challenges in the area.
    While there is evidence, both scientific and anecdotal,
    that olfactory cues help users in information recall
    tasks, there is a lack of work when the targeted
    information is one contained in a multimedia
    presentation, which is precisely the focus of this
    article. Accordingly, we present the results of two
    experimental studies. The first study measured the
    impact of olfactory media variation on the user's
    ability to perceive, synthesize, and analyze the
    informational content of olfactory-enhanced multimedia
    videos; the second study measured the impact of
    information content, and an information recall task in
    respect of user perception of the relevance, sense of
    reality, and acceptability of the olfactory media
    content, as well as the overall enjoyment of the
    experience. Results show that the use of olfactory
    media content, both pleasant and unpleasant, in
    multimedia displays does not significantly impact on
    information assimilation in a negative way. Moreover,
    the addition of a performance task may enhance the
    user's understanding of the correlation between the
    characteristic odor(s) and the scenario under
    consideration, as well as enable users to consciously
    learn the odors.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {17},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Yeh:2013:CAS,
  author = {Lo-Yao Yeh and Jiun-Long Huang},
  title = {A conditional access system with efficient key
    distribution and revocation for mobile pay-{TV}
    systems},
  journal = j-TOMCCAP,
  volume = {9},
  number = {3},
  pages = {18:1--18:??},
  month = jun,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2487268.2487271},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:50 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Current mobile pay-TV systems have two types of
    Conditional Access Systems (CAS): group-key-based and
    public-key systems. The best feature of group-key-based
    systems is the ability to enjoy the broadcast nature in
    delivery multimedia contents, while the major advantage
    of public-key systems is consolidating the security
    foundation to withstand various attacks, such as
    collusion attacks. However, the problems of
    group-key-based systems include collusion attacks, lack
    of nonrepudiation, and troublesome key distribution.
    Even worse, the benefit of broadcast efficiency is
    confined to a group size of no more than 512
    subscribers. For public-key systems, the poor delivery
    scalability is the major shortcoming because the unique
    private key feature is only suitable for one-to-one
    delivery. In this article, we introduce a scalable
    access control scheme to integrate the merits of
    broadcasting regardless of group size and sound
    security assurance, including fine-grained access
    control and collusion attack resistance. For subscriber
    revocation, a single message is broadcast to the other
    subscribers to get the updated key, thus significantly
    boosting subscriber revocation scalability. Due to
    mobile subscribers' dynamic movements, this article
    also analyzes the benefit of retransmission cases in
    our system. Through the performance evaluation and
    functionality comparison, the proposed scheme should be
    a decent candidate to enhance the security strength and
    transmission efficiency in a mobile pay-TV system.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {18},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Naskar:2013:GTL,
  author = {Ruchira Naskar and Rajat Subhra Chakraborty},
  title = {A generalized tamper localization approach for
    reversible watermarking algorithms},
  journal = j-TOMCCAP,
  volume = {9},
  number = {3},
  pages = {19:1--19:??},
  month = jun,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2487268.2487272},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:50 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {In general reversible watermarking algorithms, the
    convention is to reject the entire cover image at the
    receiver end if it fails authentication, since there is
    no way to detect the exact locations of tampering. This
    feature may be exploited by an adversary to bring about
    a form of DoS attack. Here we provide a solution to
    this problem in form of a tamper localization mechanism
    for reversible watermarking algorithms, which allows
    selective rejection of distorted cover image regions in
    case of authentication failure, thus avoiding rejection
    of the complete image. Additionally it minimizes the
    bandwidth requirement of the communication channel.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {19},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Doherty:2013:SSA,
  author = {Jonathan Doherty and Kevin Curran and Paul Mckevitt},
  title = {A self-similarity approach to repairing large dropouts
    of streamed music},
  journal = j-TOMCCAP,
  volume = {9},
  number = {3},
  pages = {20:1--20:??},
  month = jun,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2487268.2487273},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:50 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Enjoyment of audio has now become about flexibility
    and personal freedom. Digital audio content can be
    acquired from many sources and wireless networking
    allows digital media devices and associated peripherals
    to be unencumbered by wires. However, despite recent
    improvements in capacity and quality of service,
    wireless networks are inherently unreliable
    communications channels for the streaming of audio,
    being susceptible to the effects of range,
    interference, and occlusion. This time-varying
    reliability of wireless audio transfer introduces data
    corruption and loss, with unpleasant audible effects
    that can be profound and prolonged in duration.
    Traditional communications techniques for error
    mitigation perform poorly and in a bandwidth
    inefficient manner in the presence of such large-scale
    defects in a digital audio stream. A novel solution
    that can complement existing techniques takes account
    of the semantics and natural repetition of music.
    Through the use of self-similarity metadata, missing or
    damaged audio segments can be seamlessly replaced with
    similar undamaged segments that have already been
    successfully received. We propose a technology to
    generate relevant self-similarity metadata for
    arbitrary audio material and to utilize this metadata
    within a wireless audio receiver to provide
    sophisticated and real-time correction of large-scale
    errors. The primary objectives are to match the current
    section of a song being received with previous sections
    while identifying incomplete sections and determining
    replacements based on previously received portions of
    the song. This article outlines our approach to Forward
    Error Correction (FEC) technology that is used to
    ``repair'' a bursty dropout when listening to
    time-dependent media on a wireless network. Using
    self-similarity analysis on a music file, we can
    ``automatically'' repair the dropout with a similar
    portion of the music already received thereby
    minimizing a listener's discomfort.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {20},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Ho:2013:IPC,
  author = {Edmond S. L. Ho and Jacky C. P. Chan and Taku Komura
    and Howard Leung},
  title = {Interactive partner control in close interactions for
    real-time applications},
  journal = j-TOMCCAP,
  volume = {9},
  number = {3},
  pages = {21:1--21:??},
  month = jun,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2487268.2487274},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:50 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {This article presents a new framework for synthesizing
    motion of a virtual character in response to the
    actions performed by a user-controlled character in
    real time. In particular, the proposed method can
    handle scenes in which the characters are closely
    interacting with each other such as those in partner
    dancing and fighting. In such interactions,
    coordinating the virtual characters with the human
    player automatically is extremely difficult because the
    system has to predict the intention of the player
    character. In addition, the style variations from
    different users affect the accuracy in recognizing the
    movements of the player character when determining the
    responses of the virtual character. To solve these
    problems, our framework makes use of the spatial
    relationship-based representation of the body parts
    called interaction mesh, which has been proven
    effective for motion adaptation. The method is
    computationally efficient, enabling real-time character
    control for interactive applications. We demonstrate
    its effectiveness and versatility in synthesizing a
    wide variety of motions with close interactions.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {21},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Steinmetz:2013:ER,
  author =       "Ralf Steinmetz",
  title =        "Editorial: Reviewers",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "4",
  pages =        "22:1--22:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2501643.2501644",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:51 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Sakai:2013:PPC,
  author = {Kazuya Sakai and Wei-Shinn Ku and Min-Te Sun and Roger
    Zimmermann},
  title = {Privacy preserving continuous multimedia streaming in
    {MANETs}},
  journal = j-TOMCCAP,
  volume = {9},
  number = {4},
  pages = {23:1--23:??},
  month = aug,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2501643.2501645},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:51 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {At present, mobile devices are prevalent with end
    users and continuous media streaming services in mobile
    ad-hoc networks (MANETs) support popular applications.
    It is required for applications that stream isochronous
    media that the network link be continuously available.
    In this study, we introduce two group-server scheduling
    schemes to improve link continuity: static group-server
    scheduling and dynamic group-server scheduling. With
    our solution, if one of the current links between a
    client and a server instance breaks, the client can
    still download the multimedia content from another
    scheduled server peer. In addition, we incorporate the
    data link layer constraints as well as privacy concerns
    into our protocol design. The simulation results show
    that the proposed schemes significantly improve the
    effective link duration, overall system performance,
    and degree of privacy in MANETs.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {23},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Dong:2013:RIA,
  author =       "Jian Dong and Bin Cheng and Xiangyu Chen and Tat-Seng
                 Chua and Shuicheng Yan and Xi Zhou",
  title =        "Robust image annotation via simultaneous feature and
                 sample outlier pursuit",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "4",
  pages =        "24:1--24:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2501643.2501646",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:51 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Graph-based semi-supervised image annotation has
                 achieved great success in a variety of studies, yet it
                 essentially and intuitively suffers from both the
                 irrelevant/noisy features (referred to as feature
                 outliers) and the unusual/corrupted samples (referred
                 to as sample outliers). In this work, we investigate
                 how to derive robust sample affinity matrix via
                 simultaneous feature and sample outlier pursuit. This
                 task is formulated as a Dual-outlier and Prior-driven
                 Low-Rank Representation (DP-LRR) problem, which
                 possesses convexity in objective function. In DP-LRR,
                 the clean data are assumed to be self-reconstructible
                 with low-rank coefficient matrix as in LRR; while the
                 error matrix is decomposed as the sum of a row-wise
                 sparse matrix and a column-wise sparse matrix, the
                 $ l_{2, 1} $-norm minimization of which encourages the
                 pursuit of feature and sample outliers respectively.
                 The DP-LRR is further regularized by the priors from
                 side information, that is, the inhomogeneous data
                 pairs. An efficient iterative procedure based on
                 linearized alternating direction method is presented to
                 solve the DP-LRR problem, with closed-form solutions
                 within each iteration. The derived low-rank
                 reconstruction coefficient matrix is then fed into any
                 graph based semi-supervised label propagation algorithm
                 for image annotation, and as a by-product, the cleaned
                 data from DP-LRR can also be utilized as a better image
                 representation to generally boost image annotation
                 performance. Extensive experiments on MIRFlickr,
                 Corel30K, NUS-WIDE-LITE and NUS-WIDE databases well
                 demonstrate the effectiveness of the proposed
                 formulation for robust image annotation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Villanueva:2013:HMB,
  author = {Arantxa Villanueva and Victoria Ponz and Laura
    Sesma-Sanchez and Mikel Ariz and Sonia Porta and Rafael
    Cabeza},
  title = {Hybrid method based on topography for robust detection
    of iris center and eye corners},
  journal = j-TOMCCAP,
  volume = {9},
  number = {4},
  pages = {25:1--25:??},
  month = aug,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2501643.2501647},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:51 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {A multistage procedure to detect eye features is
    presented. Multiresolution and topographic
    classification are used to detect the iris center. The
    eye corner is calculated combining valley detection and
    eyelid curve extraction. The algorithm is tested in the
    BioID database and in a proprietary database containing
    more than 1200 images. The results show that the
    suggested algorithm is robust and accurate. Regarding
    the iris center our method obtains the best average
    behavior for the BioID database compared to other
    available algorithms. Additional contributions are that
    our algorithm functions in real time and does not
    require complex post processing stages.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {25},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Wang:2013:ECR,
  author = {Bo Wang and Jinqiao Wang and Hanqing Lu},
  title = {Exploiting content relevance and social relevance for
    personalized ad recommendation on {Internet TV}},
  journal = j-TOMCCAP,
  volume = {9},
  number = {4},
  pages = {26:1--26:??},
  month = aug,
  year = {2013},
  coden = {????},
  doi = {https://doi.org/10.1145/2501643.2501648},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  bibdate = {Thu Mar 13 07:37:51 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {There have been not many interactions between the two
    dominant forms of mass communication: television and
    the Internet, while nowadays the appearance of Internet
    television makes them more closely. Different with
    traditional TV in a passive mode of transmission,
    Internet TV makes it more possible to make personalized
    service recommendation because of the interactivity
    between users and the Internet. In this article, we
    introduce a scheme to provide targeted ad
    recommendation to Internet TV users by exploiting the
    content relevance and social relevance. First, we
    annotate TV videos in terms of visual content analysis
    and textual analysis by aligning visual and textual
    information. Second, with user-user, video-video and
    user-video relationships, we employ Multi-Relationship
    based Probabilistic Matrix Factorization (MRPMF) to
    learn representative tags for modeling user preference.
    And then semantic content relevance (between product/ad
    and TV video) and social relevance (between product/ad
    and user interest) are calculated by projecting the
    corresponding tags into our advertising concept space.
    Finally, with relevancy scores we make ranking for
    relevant product/ads to effectively provide users
    personalized recommendation. The experimental results
    demonstrate attractiveness and effectiveness of our
    proposed approach.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {26},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Alam:2013:MHB,
  author =       "Kazi Masudul Alam and Abu Saleh Md Mahfujur Rahman and
                 Abdulmotaleb {El Saddik}",
  title =        "Mobile haptic e-book system to support {$3$D}
                 immersive reading in ubiquitous environments",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "4",
  pages =        "27:1--27:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2501643.2501649",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:51 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In order to leverage the use of various modalities
                 such as audio-visual materials in instilling effective
                 learning behavior we present an intuitive approach of
                 annotation based hapto-audio-visual interaction with
                 the traditional digital learning materials such as
                 e-books. By integrating the home entertainment system
                 in the user's reading experience combined with haptic
                 interfaces we want to examine whether such augmentation
                 of modalities influence the user's learning patterns.
                 The proposed Haptic E-Book (HE-Book) system leverages
                 the haptic jacket, haptic arm band as well as haptic
                 sofa interfaces to receive haptic emotive signals
                 wirelessly in the form of patterned vibrations of the
                 actuators and expresses the learning material by
                 incorporating image, video, 3D environment based
                 augmented display in order to pave ways for intimate
                 reading experience in the popular mobile e-book
                 platform.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Nguyen:2013:TDA,
  author =       "Tam V. Nguyen and Si Liu and Bingbing Ni and Jun Tan
                 and Yong Rui and Shuicheng Yan",
  title =        "Towards decrypting attractiveness via multi-modality
                 cues",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "4",
  pages =        "28:1--28:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2501643.2501650",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:51 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Decrypting the secret of beauty or attractiveness has
                 been the pursuit of artists and philosophers for
                 centuries. To date, the computational model for
                 attractiveness estimation has been actively explored in
                 computer vision and multimedia community, yet with the
                 focus mainly on facial features. In this article, we
                 conduct a comprehensive study on female attractiveness
                 conveyed by single/multiple modalities of cues, that
                 is, face, dressing and/or voice, and aim to discover
                 how different modalities individually and collectively
                 affect the human sense of beauty. To extensively
                 investigate the problem, we collect the Multi-Modality
                 Beauty (M$^2$B) dataset, which is annotated with
                 attractiveness levels converted from manual $k$-wise
                 ratings and semantic attributes of different
                 modalities. Inspired by the common consensus that
                 middle-level attribute prediction can assist
                 higher-level computer vision tasks, we manually labeled
                 many attributes for each modality. Next, a tri-layer
                 Dual-supervised Feature-Attribute-Task (DFAT) network
                 is proposed to jointly learn the attribute model and
                 attractiveness model of single/multiple modalities. To
                 remedy possible loss of information caused by
                 incomplete manual attributes, we also propose a novel
                 Latent Dual-supervised Feature-Attribute-Task (LDFAT)
                 network, where latent attributes are combined with
                 manual attributes to contribute to the final
                 attractiveness estimation. The extensive experimental
                 evaluations on the collected M$^2$B dataset well
                 demonstrate the effectiveness of the proposed DFAT and
                 LDFAT networks for female attractiveness prediction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Tang:2013:TOH,
  author =       "Jinhui Tang and Qiang Chen and Meng Wang and Shuicheng
                 Yan and Tat-Seng Chua and Ramesh Jain",
  title =        "Towards optimizing human labeling for interactive
                 image tagging",
  journal =      j-TOMCCAP,
  volume =       "9",
  number =       "4",
  pages =        "29:1--29:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2501643.2501651",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:51 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Interactive tagging is an approach that combines human
                 and computer to assign descriptive keywords to image
                 contents in a semi-automatic way. It can avoid the
                 problems in automatic tagging and pure manual tagging
                 by achieving a compromise between tagging performance
                 and manual cost. However, conventional research efforts
                 on interactive tagging mainly focus on sample selection
                 and models for tag prediction. In this work, we
                 investigate interactive tagging from a different
                 aspect. We introduce an interactive image tagging
                 framework that can more fully make use of human's
                 labeling efforts. That means, it can achieve a
                 specified tagging performance by taking less manual
                 labeling effort or achieve better tagging performance
                 with a specified labeling cost. In the framework,
                 hashing is used to enable a quick clustering of image
                 regions and a dynamic multiscale clustering labeling
                 strategy is proposed such that users can label a large
                 group of similar regions each time. We also employ a
                 tag refinement method such that several inappropriate
                 tags can be automatically corrected. Experiments on a
                 large dataset demonstrate the effectiveness of our
                 approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Carbunar:2013:FNA,
  author = {Bogdan Carbunar and Rahul Potharaju and Michael Pearce
    and Venugopal Vasudevan and Michael Needham},
  title = {A framework for network aware caching for video on
    demand systems},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {9},
  number = {4},
  articleno = {30},
  pages = {30:1--30:??},
  month = aug,
  year = {2013},
  doi = {https://doi.org/10.1145/2501643.2501652},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  note = {See errata \cite{Carbunar:2014:EFN}.},
  abstract = {Video on Demand (VoD) services allow users to select
    and locally consume remotely stored content. We
    investigate the use of caching to solve the scalability
    issues of several existing VoD providers. We propose
    metrics and goals that define the requirements of a
    caching framework for CDNs of VoD systems. Using data
    logs collected from Motorola equipment from Comcast VoD
    deployments we show that several classic caching
    solutions do not satisfy the proposed goals. We address
    this issue by developing novel techniques for
    predicting future values of several metrics of
    interest. We rely on computed predictions to define the
    penalty imposed on the system, both network and caching
    sites, when not storing individual items. We use item
    penalties to devise novel caching and static content
    placement strategies. We use the previously mentioned
    data logs to validate our solutions and show that they
    satisfy all the defined goals.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:51 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Li:2013:ENO,
  author = {Zechao Li and Jing Liu and Meng Wang and Changsheng Xu
    and Hanqing Lu},
  title = {Enhancing news organization for convenient retrieval
    and browsing},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {1},
  articleno = {1},
  pages = {1:1--1:??},
  month = dec,
  year = {2013},
  doi = {https://doi.org/10.1145/2488732},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {To facilitate users to access news quickly and
    comprehensively, we design a news search and browsing
    system named GeoVisNews, in which the news elements of
    ``Where'', ``Who'', ``What'' and ``When'' are enhanced
    via news geo-localization, image enrichment and joint
    ranking, respectively. For news geo-localization, an
    Ordinal Correlation Consistent Matrix Factorization
    (OCCMF) model is proposed to maintain the relevance
    rankings of locations to a specific news document and
    simultaneously capture intra-relations among locations
    and documents. To visualize news, we develop a novel
    method to enrich news documents with appropriate web
    images. Specifically, multiple queries are first
    generated from news documents for image search, and
    then the appropriate images are selected from the
    collected web images by an intelligent fusion approach
    based on multiple features. Obtaining the geo-localized
    and image enriched news resources, we further employ a
    joint ranking strategy to provide relevant, timely and
    popular news items as the answer of user searching
    queries. Extensive experiments on a large-scale news
    dataset collected from the web demonstrate the superior
    performance of the proposed approaches over related
    methods.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@Article{Knees:2013:SMS,
  author =       "Peter Knees and Markus Schedl",
  title =        "A survey of music similarity and recommendation from
                 music context data",
  journal =      j-TOMCCAP,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2542205.2542206",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:53 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this survey article, we give an overview of methods
                 for music similarity estimation and music
                 recommendation based on music context data. Unlike
                 approaches that rely on music content and have been
                 researched for almost two decades, music-context-based
                 (or contextual) approaches to music retrieval are a
                 quite recent field of research within music information
                 retrieval (MIR). Contextual data refers to all
                 music-relevant information that is not included in the
                 audio signal itself. In this article, we focus on
                 contextual aspects of music primarily accessible
                 through web technology. We discuss different sources of
                 context-based data for individual music pieces and for
                 music artists. We summarize various approaches for
                 constructing similarity measures based on the
                 collaborative or cultural knowledge incorporated into
                 these data sources. In particular, we identify and
                 review three main types of context-based similarity
                 approaches: text-retrieval-based approaches (relying on
                 web-texts, tags, or lyrics), co-occurrence-based
                 approaches (relying on playlists, page counts,
                 microblogs, or peer-to-peer-networks), and approaches
                 based on user ratings or listening habits. This article
                 elaborates the characteristics of the presented
                 context-based measures and discusses their strengths as
                 well as their weaknesses.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhao:2013:DPO,
  author =       "Yi-Liang Zhao and Qiang Chen and Shuicheng Yan and
                 Tat-Seng Chua and Daqing Zhang",
  title =        "Detecting profilable and overlapping communities with
                 user-generated multimedia contents in {LBSNs}",
  journal =      j-TOMCCAP,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2502415",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 13 07:37:53 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In location-based social networks (LBSNs), users
                 implicitly interact with each other by visiting places,
                 issuing comments and/or uploading photos. These
                 heterogeneous interactions convey the latent
                 information for identifying meaningful user groups,
                 namely social communities, which exhibit unique
                 location-oriented characteristics. In this work, we aim
                 to detect and profile social communities in LBSNs by
                 representing the heterogeneous interactions with a
                 multimodality nonuniform hypergraph. Here, the vertices
                 of the hypergraph are users, venues, textual comments
                 or photos and the hyperedges characterize the
                 $k$-partite heterogeneous interactions such as posting
                 certain comments or uploading certain photos while
                 visiting certain places. We then view each detected
                 social community as a dense subgraph within the
                 heterogeneous hypergraph, where the user community is
                 constructed by the vertices and edges in the dense
                 subgraph and the profile of the community is
                 characterized by the vertices related with venues,
                 comments and photos and their inter-relations. We
                 present an efficient algorithm to detect the overlapped
                 dense subgraphs, where the profile of each social
                 community is guaranteed to be available by constraining
                 the minimal number of vertices in each modality.
                 Extensive experiments on Foursquare data well validated
                 the effectiveness of the proposed framework in terms of
                 detecting meaningful social communities and uncovering
                 their underlying profiles in LBSNs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Bhatnagar:2013:SRI,
  author = {Gaurav Bhatnagar and Q. M. Jonathan Wu and Pradeep K.
    Atrey},
  title = {Secure randomized image watermarking based on singular
    value decomposition},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {1},
  articleno = {4},
  pages = {4:1--4:??},
  month = dec,
  year = {2013},
  doi = {https://doi.org/10.1145/2542205.2542207},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {In this article, a novel logo watermarking scheme is
    proposed based on wavelet frame transform, singular
    value decomposition and automatic thresholding. The
    proposed scheme essentially rectifies the ambiguity
    problem in the SVD-based watermarking. The core idea is
    to randomly upscale the size of host image using
    reversible random extension transform followed by the
    embedding of logo watermark in the wavelet frame
    domain. After embedding, a verification phase is casted
    with the help of a binary watermark and toral
    automorphism. At the extraction end, the binary
    watermark is first extracted followed by the
    verification of watermarked image. The logo watermark
    is extracted if and only if the watermarked image is
    verified. The security, attack and comparative analysis
    confirm high security, efficiency and robustness of the
    proposed watermarking system.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Mou:2013:CBC,
  author = {Luntian Mou and Tiejun Huang and Yonghong Tian and
    Menglin Jiang and Wen Gao},
  title = {Content-based copy detection through multimodal
    feature representation and temporal pyramid matching},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {1},
  articleno = {5},
  pages = {5:1--5:??},
  month = dec,
  year = {2013},
  doi = {https://doi.org/10.1145/2542205.2542208},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {Content-based copy detection (CBCD) is drawing
    increasing attention as an alternative technology to
    watermarking for video identification and copyright
    protection. In this article, we present a comprehensive
    method to detect copies that are subjected to
    complicated transformations. A multimodal feature
    representation scheme is designed to exploit the
    complementarity of audio features, global and local
    visual features so that optimal overall robustness to a
    wide range of complicated modifications can be
    achieved. Meanwhile, a temporal pyramid matching
    algorithm is proposed to assemble frame-level
    similarity search results into sequence-level matching
    results through similarity evaluation over multiple
    temporal granularities. Additionally, inverted indexing
    and locality sensitive hashing (LSH) are also adopted
    to speed up similarity search. Experimental results
    over benchmarking datasets of TRECVID 2010 and 2009
    demonstrate that the proposed method outperforms other
    methods for most transformations in terms of copy
    detection accuracy. The evaluation results also suggest
    that our method can achieve competitive copy
    localization preciseness.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/hash.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Chen:2013:LSM,
  author = {Xiangyu Chen and Yadong Mu and Hairong Liu and
    Shuicheng Yan and Yong Rui and Tat-Seng Chua},
  title = {Large-scale multilabel propagation based on efficient
    sparse graph construction},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {1},
  articleno = {6},
  pages = {6:1--6:??},
  month = dec,
  year = {2013},
  doi = {https://doi.org/10.1145/2542205.2542209},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {With the popularity of photo-sharing websites, the
    number of web images has exploded into unseen
    magnitude. Annotating such large-scale data will cost
    huge amount of human resources and is thus
    unaffordable. Motivated by this challenging problem, we
    propose a novel sparse graph based multilabel
    propagation (SGMP) scheme for super large scale
    datasets. Both the efficacy and accuracy of the image
    annotation are further investigated under different
    graph construction strategies, where Gaussian noise and
    non-Gaussian sparse noise are simultaneously considered
    in the formulations of these strategies. Our proposed
    approach outperforms the state-of-the-art algorithms by
    focusing on: (1) For large-scale graph construction, a
    simple yet efficient LSH (Locality Sensitive
    Hashing)-based sparse graph construction scheme is
    proposed to speed up the construction. We perform the
    multilabel propagation on this hashing-based graph
    construction, which is derived with LSH approach
    followed by sparse graph construction within the
    individual hashing buckets; (2) To further improve the
    accuracy, we propose a novel sparsity induced scalable
    graph construction scheme, which is based on a general
    sparse optimization framework. Sparsity essentially
    implies a very strong prior: for large scale
    optimization, the values of most variables shall be
    zeros when the solution reaches the optimum. By
    utilizing this prior, the solutions of large-scale
    sparse optimization problems can be derived by solving
    a series of much smaller scale subproblems; (3) For
    multilabel propagation, different from the traditional
    algorithms that propagate over individual label
    independently, our proposed propagation first encodes
    the label information of an image as a unit label
    confidence vector and naturally imposes inter-label
    constraints and manipulates labels interactively. Then,
    the entire propagation problem is formulated on the
    concept of Kullback--Leibler divergence defined on
    probabilistic distributions, which guides the
    propagation of the supervision information. Extensive
    experiments on the benchmark dataset NUS-WIDE with 270k
    images and its lite version NUS-WIDE-LITE with 56k
    images well demonstrate the effectiveness and
    scalability of the proposed multi-label propagation
    scheme.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/hash.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Houle:2013:API,
  author = {Michael E. Houle and Vincent Oria and Shin'ichi Satoh
    and Jichao Sun},
  title = {Annotation propagation in image databases using
    similarity graphs},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {1},
  articleno = {7},
  pages = {7:1--7:??},
  month = dec,
  year = {2013},
  doi = {https://doi.org/10.1145/2487736},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {The practicality of large-scale image indexing and
    querying methods depends crucially upon the
    availability of semantic information. The manual
    tagging of images with semantic information is in
    general very labor intensive, and existing methods for
    automated image annotation may not always yield
    accurate results. The aim of this paper is to reduce to
    a minimum the amount of human intervention required in
    the semantic annotation of images, while preserving a
    high degree of accuracy. Ideally, only one copy of each
    object of interest would be labeled manually, and the
    labels would then be propagated automatically to all
    other occurrences of the objects in the database. To
    this end, we propose an influence propagation strategy,
    SW-KProp, that requires no human intervention beyond
    the initial labeling of a subset of the images.
    SW-KProp distributes semantic information within a
    similarity graph defined on all images in the database:
    each image iteratively transmits its current label
    information to its neighbors, and then readjusts its
    own label according to the combined influences of its
    neighbors. SW-KProp influence propagation can be
    efficiently performed by means of matrix computations,
    provided that pairwise similarities of images are
    available. We also propose a variant of SW-KProp which
    enhances the quality of the similarity graph by
    selecting a reduced feature set for each prelabeled
    image and rebuilding its neighborhood. The performances
    of the SW-KProp method and its variant were evaluated
    against several competing methods on classification
    tasks for three image datasets: a handwritten digit
    dataset, a face dataset and a web image dataset. For
    the digit images, SW-KProp and its variant performed
    consistently better than the other methods tested. For
    the face and web images, SW-KProp outperformed its
    competitors for the case when the number of prelabeled
    images was relatively small. The performance was seen
    to improve significantly when the feature selection
    strategy was applied.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Mallik:2013:MOR,
  author = {Anupama Mallik and Hiranmay Ghosh and Santanu
    Chaudhury and Gaurav Harit},
  title = {{MOWL}: an ontology representation language for
    {Web}-based multimedia applications},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {1},
  articleno = {8},
  pages = {8:1--8:??},
  month = dec,
  year = {2013},
  doi = {https://doi.org/10.1145/2542205.2542210},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {Several multimedia applications need to reason with
    concepts and their media properties in specific domain
    contexts. Media properties of concepts exhibit some
    unique characteristics that cannot be dealt with
    conceptual modeling schemes followed in the existing
    ontology representation and reasoning schemes. We have
    proposed a new perceptual modeling technique for
    reasoning with media properties observed in multimedia
    instances and the latent concepts. Our knowledge
    representation scheme uses a causal model of the world
    where concepts manifest in media properties with
    uncertainties. We introduce a probabilistic reasoning
    scheme for belief propagation across domain concepts
    through observation of media properties. In order to
    support the perceptual modeling and reasoning paradigm,
    we propose a new ontology language, Multimedia Web
    Ontology Language (MOWL). Our primary contribution in
    this article is to establish the need for the new
    ontology language and to introduce the semantics of its
    novel language constructs. We establish the generality
    of our approach with two disparate knowledge-intensive
    applications involving reasoning with media properties
    of concepts.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Deng:2014:DLB,
  author = {Yunhua Deng and Rynson W. H. Lau},
  title = {Dynamic load balancing in distributed virtual
    environments using heat diffusion},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {2},
  articleno = {16},
  pages = {16:1--16:??},
  month = feb,
  year = {2014},
  doi = {https://doi.org/10.1145/2499906},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {Distributed virtual environments (DVEs) are attracting
    a lot of attention in recent years, due to the
    increasing popularity of online gaming and social
    networks. As the number of concurrent users of a DVE
    increases, a critical problem is on how the workload
    among multiple servers can be balanced in order to
    maintain real-time performance. Although a number of
    load balancing methods have been proposed, they either
    try to produce high quality load balancing results and
    become too slow or emphasize on efficiency and the load
    balancing results become less effective. In this
    article, we propose a new approach to address this
    problem based on heat diffusion. Our work has two main
    contributions. First, we propose a local and a global
    load balancing methods for DVEs based on heat
    diffusion. Second, we investigate two performance
    factors of the proposed methods, the convergence
    threshold and the load balancing interval. We have
    conducted a number of experiments to extensively
    evaluate the performance of the proposed methods. Our
    experimental results show that the proposed methods
    outperform existing methods in that our methods are
    effective in reducing server overloading while at the
    same time being efficient.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{She:2014:CID,
  author = {James She and Jon Crowcroft and Hao Fu and Flora Li},
  title = {Convergence of interactive displays with smart mobile
    devices for effective advertising: a survey},
  journal = j-TOMCCAP,
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume = {10},
  number = {2},
  articleno = {17},
  pages = {17:1--17:??},
  month = feb,
  year = {2014},
  doi = {https://doi.org/10.1145/2557450},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  abstract = {The trend of replacing public static signages with
    digital displays creates opportunities for interactive
    display systems, which can be used in collaborative
    workspaces, social gaming platforms and advertising.
    Based on marketing communication concepts and existing
    models for consumer behavior, three stages, namely
    attraction, interaction and conation, are defined in
    this article to analyze the effectiveness of
    interactive display advertising. By reviewing various
    methods and strategies employed by existing systems
    with attraction, interaction and conation stages, this
    article concludes that smart mobile devices should be
    integrated as a component to increase the effectiveness
    of interactive displays as advertising tools. Future
    research challenges related to this topic are also
    discussed.},
  acknowledgement = ack-nhfb,
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@Article{Gonina:2014:SMC,
  author = {Gonina, Ekaterina and Friedland, Gerald and
    Battenberg, Eric and Koanantakool, Penporn and
    Driscoll, Michael and Georganas, Evangelos and Keutzer, Kurt},
  title = {Scalable multimedia content analysis on parallel
    platforms using {Python}},
  journal = j-TOMCCAP,
  volume = {10},
  number = {2},
  pages = {18:1--18:??},
  month = feb,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2517151},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/pvm.bib;
    https://www.math.utah.edu/pub/tex/bib/python.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {In this new era dominated by consumer-produced media
    there is a high demand for web-scalable solutions to
    multimedia content analysis. A compelling approach to
    making applications scalable is to explicitly map their
    computation onto parallel platforms. However,
    developing efficient parallel implementations and fully
    utilizing the available resources remains a challenge
    due to the increased code complexity, limited
    portability and required low-level knowledge of the
    underlying hardware. In this article, we present
    PyCASP, a Python-based framework that automatically
    maps computation onto parallel platforms from Python
    application code to a variety of parallel platforms.
    PyCASP is designed using a systematic, pattern-oriented
    approach to offer a single software development
    environment for multimedia content analysis
    applications. Using PyCASP, applications can be
    prototyped in a couple hundred lines of Python code and
    automatically scale to modern parallel processors.
    Applications written with PyCASP are portable to a
    variety of parallel platforms and efficiently scale
    from a single desktop Graphics Processing Unit (GPU) to
    an entire cluster with a small change to application
    code. To illustrate our approach, we present three
    multimedia content analysis applications that use our
    framework: a state-of-the-art speaker diarization
    application, a content-based music recommendation
    system based on the Million Song Dataset, and a video
    event detection system for consumer-produced videos. We
    show that across this wide range of applications, our
    approach achieves the goal of automatic portability and
    scalability while at the same time allowing easy
    prototyping in a high-level language and efficient
    performance of low-level optimized code.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {18},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Chandra:2014:HPM,
  author = {Chandra, Surendar and Boreczky, John and
    Rowe, Lawrence A.},
  title = {High performance many-to-many intranet screen sharing
    with {DisplayCast}},
  journal = j-TOMCCAP,
  volume = {10},
  number = {2},
  pages = {19:1--19:??},
  month = feb,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2534328},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {DisplayCast is a many to many Intranet screen sharing
    system. Its screen capture mechanism creates a sequence
    of pixmap images of the screen updates. Prior systems
    that used a similar approach were designed to operate
    over constrained wide-area networks and did not exploit
    the Intranet network conditions to achieve high capture
    rates. First we empirically analyzed the screen
    contents for a variety of scenarios. We showed that
    screen updates were sporadic with long periods of
    inactivity. When active, screens were updated at far
    higher rates than was supported by earlier systems. The
    mismatch was pronounced for interactive scenarios. Even
    during active screen updates, the number of updated
    pixels were frequently small. We showed that crucial
    information can be lost if individual updates were
    merged. When the available system resources could not
    support high capture rates, we showed ways in which
    updates can be effectively collapsed. Next, we
    investigate compression mechanisms for streaming these
    updates. Even while using a hardware encoder, lossy
    compressors such as H.264 were unable to sustain high
    frame rates. Though Zlib lossless compression operated
    within the latency and compression rate requirements,
    the compression efficiency was poor. By analyzing the
    screen pixels, we developed a practical transformation
    that significantly improved compression rates.
    DisplayCast incorporates these observations. It shares
    the processor and network resources required for screen
    capture, compression and transmission with host
    applications whose output needs to be shared.
    DisplayCast is agile and uses faster processing
    capability to achieve even higher performance. Our
    system components operate natively in Windows 7, Mac OS
    X and iOS and is deployed in a production setting.
    DisplayCast is released under a New BSD License.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {19},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Lee:2014:NDH,
  author = {Lee, Ya-Lin and Tsai, Wen-Hsiang},
  title = {A new data hiding method via revision history records
    on collaborative writing platforms},
  journal = j-TOMCCAP,
  volume = {10},
  number = {2},
  pages = {20:1--20:??},
  month = feb,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2534408},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {A new data hiding method via collaboratively-written
    articles with forged revision history records on
    collaborative writing platforms is proposed. The hidden
    message is camouflaged as a stego-document consisting
    of a stego-article and a revision history created
    through a simulated process of collaborative writing.
    The revisions are forged using a database constructed
    by mining word sequences used in real cases from an
    English Wikipedia XML dump. Four characteristics of
    article revisions are identified and utilized to embed
    secret messages, including the author of each revision,
    the number of corrected word sequences, the content of
    the corrected word sequences, and the word sequences
    replacing the corrected ones. Related problems arising
    in utilizing these characteristics for data hiding are
    identified and solved skillfully, resulting in an
    effective multiway method for hiding secret messages
    into the revision history. To create more realistic
    revisions, Huffman coding based on the word sequence
    frequencies collected from Wikipedia is applied to
    encode the word sequences. Good experimental results
    show the feasibility of the proposed method.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {20},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Yuan:2014:MRB,
  author = {Yuan, Jin and Zhao, Yi-Liang and Luan, Huanbo and
    Wang, Meng and Chua, Tat-Seng},
  title = {Memory recall based video search: Finding videos you
    have seen before based on your memory},
  journal = j-TOMCCAP,
  volume = {10},
  number = {2},
  pages = {21:1--21:??},
  month = feb,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2534409},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {We often remember images and videos that we have seen
    or recorded before but cannot quite recall the exact
    venues or details of the contents. We typically have
    vague memories of the contents, which can often be
    expressed as a textual description and/or rough visual
    descriptions of the scenes. Using these vague memories,
    we then want to search for the corresponding videos of
    interest. We call this ``Memory Recall based Video
    Search'' (MRVS). To tackle this problem, we propose a
    video search system that permits a user to input
    his/her vague and incomplete query as a combination of
    text query, a sequence of visual queries, and/or
    concept queries. Here, a visual query is often in the
    form of a visual sketch depicting the outline of scenes
    within the desired video, while each corresponding
    concept query depicts a list of visual concepts that
    appears in that scene. As the query specified by users
    is generally approximate or incomplete, we need to
    develop techniques to handle this inexact and
    incomplete specification by also leveraging on user
    feedback to refine the specification. We utilize
    several innovative approaches to enhance the automatic
    search. First, we employ a visual query suggestion
    model to automatically suggest potential visual
    features to users as better queries. Second, we utilize
    a color similarity matrix to help compensate for
    inexact color specification in visual queries. Third,
    we leverage on the ordering of visual queries and/or
    concept queries to rerank the results by using a greedy
    algorithm. Moreover, as the query is inexact and there
    is likely to be only one or few possible answers, we
    incorporate an interactive feedback loop to permit the
    users to label related samples which are visually
    similar or semantically close to the relevant sample.
    Based on the labeled samples, we then propose
    optimization algorithms to update visual queries and
    concept weights to refine the search results. We
    conduct experiments on two large-scale video datasets:
    TRECVID 2010 and YouTube. The experimental results
    demonstrate that our proposed system is effective for
    MRVS tasks.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {21},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Liu:2014:MIK,
  author = {Liu, Xianglong and Mu, Yadong and Lang, Bo and
    Chang, Shih-Fu},
  title = {Mixed image-keyword query adaptive hashing over
    multilabel images},
  journal = j-TOMCCAP,
  volume = {10},
  number = {2},
  pages = {22:1--22:??},
  month = feb,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2540990},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Thu Mar 13 07:37:57 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/hash.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {This article defines a new hashing task motivated by
    real-world applications in content-based image
    retrieval, that is, effective data indexing and
    retrieval given mixed query (query image together with
    user-provided keywords). Our work is distinguished from
    state-of-the-art hashing research by two unique
    features: (1) Unlike conventional image retrieval
    systems, the input query is a combination of an
    exemplar image and several descriptive keywords, and
    (2) the input image data are often associated with
    multiple labels. It is an assumption that is more
    consistent with the realistic scenarios. The mixed
    image-keyword query significantly extends traditional
    image-based query and better explicates the user
    intention. Meanwhile it complicates semantics-based
    indexing on the multilabel data. Though several
    existing hashing methods can be adapted to solve the
    indexing task, unfortunately they all prove to suffer
    from low effectiveness. To enhance the hashing
    efficiency, we propose a novel scheme ``boosted shared
    hashing''. Unlike prior works that learn the hashing
    functions on either all image labels or a single label,
    we observe that the hashing function can be more
    effective if it is designed to index over an optimal
    label subset. In other words, the association between
    labels and hash bits are moderately sparse. The
    sparsity of the bit-label association indicates greatly
    reduced computation and storage complexities for
    indexing a new sample, since only limited number of
    hashing functions will become active for the specific
    sample. We develop a Boosting style algorithm for
    simultaneously optimizing both the optimal label
    subsets and hashing functions in a unified formulation,
    and further propose a query-adaptive retrieval
    mechanism based on hash bit selection for mixed
    queries, no matter whether or not the query words exist
    in the training data. Moreover, we show that the
    proposed method can be easily extended to the case
    where the data similarity is gauged by nonlinear kernel
    functions. Extensive experiments are conducted on
    standard image benchmarks like CIFAR-10, NUS-WIDE and
    a-TRECVID. The results validate both the sparsity of
    the bit-label association and the convergence of the
    proposed algorithm, and demonstrate that the proposed
    hashing scheme achieves substantially superior
    performances over state-of-the-art methods under the
    same hash bit budget.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {22},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

%%% NOTE(review): the title names supplement issue 1s while number is 3,
%%% and article number 22 is also carried by Liu:2014:MIK in issue 2.
%%% Values are kept as recorded; confirm both against the publisher's
%%% record for this DOI before relying on them.
@Article{Anonymous:2014:TCO,
  author = {Anonymous},
  title = {Table of Contents: Online Supplement Volume 10, Number
    1s},
  journal = j-TOMCCAP,
  volume = {10},
  number = {3},
  pages = {22:1--22:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2602969},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {22},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Liu:2014:DUB,
  author =       "Ning Liu and Huajie Cui and S.-H. Gary Chan and
                 Zhipeng Chen and Yirong Zhuang",
  title =        "Dissecting User Behaviors for a Simultaneous Live and
                 {VoD IPTV} System",
  journal =      j-TOMCCAP,
  volume =       "10",
  number =       "3",
  pages =        "23:1--23:??",
  month =        apr,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2568194",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Apr 15 12:20:53 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "IPTV services deployed nowadays often consist of both
                 live TV and Video-on-Demand (VoD), offered by the same
                 service provider to the same pool of users over the
                 same managed network. Understanding user behaviors in
                 such a setting is hence an important step for system
                 modelling and optimization. Previous studies on user
                 behavior on video services were on either live TV or
                 VoD. For the first time, we conduct an in-depth
                 large-scale behavior study for IPTV users offering
                 simultaneously live TV and VoD choices at the same
                 time. Our data is from the largest IPTV service
                 provider in China, offering hundreds of live channels
                 and hundreds of thousands of VoD files, with traces
                 covering more than 1.9 million users over a period of 5
                 months. This large dataset provides us a unique
                 opportunity to cross-compare user viewing behaviors for
                 these services on the same platform, and sheds valuable
                 insights on how users interact with such a simultaneous
                 system. Our results lead to new understanding on IPTV
                 user behaviors which have strong implications on system
                 design. For example, we find that the average holding
                 time for VoD is significantly longer than live TV. Live
                 TV users tend to surf more. However, if such channel
                 surfing is discounted, the holding times of both
                 services are not much different. While users in VoD
                 tend to view HD longer, channel popularity for live TV
                 is much less dependent on its video quality. In
                 contrast to some popular assumptions on user
                 interactivity, the transitions among live TV, VoD, and
                 offline modes are far from a Markov model.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gaeta:2014:DDI,
  author = {Gaeta, Rossano and Grangetto, Marco and Bovio, Lorenzo},
  title = {{DIP}: {Distributed Identification of Polluters} in
    {P2P} Live Streaming},
  journal = j-TOMCCAP,
  volume = {10},
  number = {3},
  pages = {24:1--24:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2568223},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Peer-to-peer live streaming applications are
    vulnerable to malicious actions of peers that
    deliberately modify data to decrease or prevent the
    fruition of the media (pollution attack). In this
    article we propose DIP, a fully distributed, accurate,
    and robust algorithm for the identification of
    polluters. DIP relies on checks that are computed by
    peers upon completing reception of all blocks composing
    a data chunk. A check is a special message that
    contains the set of peer identifiers that provided
    blocks of the chunk as well as a bit to signal if the
    chunk has been corrupted. Checks are periodically
    transmitted by peers to their neighbors in the overlay
    network; peers receiving checks use them to maintain a
    factor graph. This graph is bipartite and an
    incremental belief propagation algorithm is run on it
    to compute the probability of a peer being a polluter.
    Using a prototype deployed over PlanetLab we show by
    extensive experimentation that DIP allows honest peers
    to identify polluters with very high accuracy and
    completeness, even when polluters collude to deceive
    them. Furthermore, we show that DIP is efficient,
    requiring low computational, communication, and storage
    overhead at each peer.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {24},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Hoque:2014:SEM,
  author =       "Mohammad Ashraful Hoque and Matti Siekkinen and Jukka
                 K. Nurminen and Sasu Tarkoma and Mika Aalto",
  title =        "Saving Energy in Mobile Devices for On-Demand
                 Multimedia Streaming --- A Cross-Layer Approach",
  journal =      j-TOMCCAP,
  volume =       "10",
  number =       "3",
  pages =        "25:1--25:??",
  month =        apr,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556942",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Apr 15 12:20:53 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article proposes a novel energy-efficient
                 multimedia delivery system called EStreamer. First, we
                 study the relationship between buffer size at the
                 client, burst-shaped TCP-based multimedia traffic, and
                 energy consumption of wireless network interfaces in
                 smartphones. Based on the study, we design and
                 implement EStreamer for constant bit rate and
                 rate-adaptive streaming. EStreamer can improve battery
                 lifetime by 3x, 1.5x, and 2x while streaming over
                 Wi-Fi, 3G, and 4G, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2014:HEK,
  author = {Wang, Feng and Zhao, Wan-Lei and Ngo, Chong-Wah and
    Merialdo, Bernard},
  title = {A {Hamming} Embedding Kernel with Informative
    Bag-of-Visual Words for Video Semantic Indexing},
  journal = j-TOMCCAP,
  volume = {10},
  number = {3},
  pages = {26:1--26:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2535938},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {In this article, we propose a novel Hamming embedding
    kernel with informative bag-of-visual words to address
    two main problems existing in traditional BoW
    approaches for video semantic indexing. First, Hamming
    embedding is employed to alleviate the information loss
    caused by SIFT quantization. The Hamming distances
    between keypoints in the same cell are calculated and
    integrated into the SVM kernel to better discriminate
    different image samples. Second, to highlight the
    concept-specific visual information, we propose to
    weight the visual words according to their
    informativeness for detecting specific concepts. We
    show that our proposed kernels can significantly
    improve the performance of concept detection.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {26},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Yang:2014:MDF,
  author = {Yang, Ying and Ivrissimtzis, Ioannis},
  title = {Mesh Discriminative Features for {$3$D} Steganalysis},
  journal = j-TOMCCAP,
  volume = {10},
  number = {3},
  pages = {27:1--27:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2535555},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {We propose a steganalytic algorithm for triangle
    meshes, based on the supervised training of a
    classifier by discriminative feature vectors. After a
    normalization step, the triangle mesh is calibrated by
    one step of Laplacian smoothing and then a feature
    vector is computed, encoding geometric information
    corresponding to vertices, edges and faces. For a given
    steganographic or watermarking algorithm, we create a
    training set containing unmarked meshes and meshes
    marked by that algorithm, and train a classifier using
    Quadratic Discriminant Analysis. The performance of the
    proposed method was evaluated on six well-known
    watermarking/steganographic schemes with satisfactory
    accuracy rates.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {27},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Hamam:2014:QEM,
  author = {Hamam, Abdelwahab and {El Saddik}, Abdulmotaleb and
    Alja'am, Jihad},
  title = {A Quality of Experience Model for Haptic Virtual
    Environments},
  journal = j-TOMCCAP,
  volume = {10},
  number = {3},
  pages = {28:1--28:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2540991},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tomccap/;
    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Haptic-based Virtual Reality (VR) applications have
    many merits. What is still obscure, from the designer's
    perspective of these applications, is the experience
    the users will undergo when they use the VR system.
    Quality of Experience (QoE) is an evaluation metric
    from the user's perspective that unfortunately has
    received limited attention from the research community.
    Assessing the QoE of VR applications reflects the
    amount of overall satisfaction and benefits gained from
    the application in addition to laying the foundation
    for ideal user-centric design in the future. In this
    article, we propose a taxonomy for the evaluation of
    QoE for multimedia applications and in particular VR
    applications. We model this taxonomy using a Fuzzy
    logic Inference System (FIS) to quantitatively measure
    the QoE of haptic virtual environments. We build and
    test our FIS by conducting a users' study analysis to
    evaluate the QoE of a haptic game application. Our
    results demonstrate that the proposed FIS model
    reflects the user's estimation of the application's
    quality significantly with low error and hence is
    suited for QoE evaluation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {28},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Botta:2014:PCI,
  author          = {Marco Botta and Davide Cavagnino and Victor Pomponiu},
  title           = {Protecting the Content Integrity of Digital Imagery
                     with Fidelity Preservation: An Improved Version},
  journal         = j-TOMCCAP,
  volume          = {10},
  number          = {3},
  pages           = {29:1--29:??},
  month           = apr,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2568224},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Fragile watermarking has attracted a lot of attention
                     in the last decade. An interesting approach, presented
                     in 2011 by Lin et al., results in very high quality of
                     the watermarked images. However, after a thorough
                     examination of the paper, a few improvements are
                     proposed in our revised version of the algorithm in
                     order to overcome some shortcomings. In particular,
                     changes to the pseudocode and modifications to deal
                     with pixel saturation are suggested, along with a way
                     to improve the scheme security. Finally, a deeper
                     analysis of the security is presented.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {29},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Luo:2014:ICH,
  author          = {Da Luo and Weiqi Luo and Rui Yang and Jiwu Huang},
  title           = {Identifying Compression History of Wave Audio and Its
                     Applications},
  journal         = j-TOMCCAP,
  volume          = {10},
  number          = {3},
  pages           = {30:1--30:??},
  month           = apr,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2575978},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Apr 15 12:20:53 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Audio signal is sometimes stored and/or processed in
                     WAV (waveform) format without any knowledge of its
                     previous compression operations. To perform some
                     subsequent processing, such as digital audio forensics,
                     audio enhancement and blind audio quality assessment,
                     it is necessary to identify its compression history. In
                     this article, we will investigate how to identify a
                     decompressed wave audio that went through one of three
                     popular compression schemes, including MP3, WMA
                     (windows media audio) and AAC (advanced audio coding).
                     By analyzing the corresponding frequency coefficients,
                     including modified discrete cosine transform (MDCT) and
                     Mel-frequency cepstral coefficients (MFCCs), of those
                     original audio clips and their decompressed versions
                     with different compression schemes and bit rates, we
                     propose several statistics to identify the compression
                     scheme as well as the corresponding bit rate previously
                     used for a given WAV signal. The experimental results
                     evaluated on 8,800 audio clips with various contents
                     have shown the effectiveness of the proposed method. In
                     addition, some potential applications of the proposed
                     method are discussed.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {30},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

%%% ====================================================================
%%% From the ACM Portal Web site: ``On 23rd May 2014, ACM TOMCCAP
%%% changed its acronym to ACM TOMM. This acronym change was the result
%%% of extensive discussions between the journal Editorial Board and
%%% SIGMM constituents dating back to 2011. This name change emphasizes
%%% the continued strong collaboration with the ACM Multimedia
%%% conference (ACMMM).''
%%% ====================================================================

@Article{Zhang:2014:CDM,
  author =       "Tianzhu Zhang and Changsheng Xu",
  title =        "Cross-Domain Multi-Event Tracking via {CO-PMHT}",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "31:1--31:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2602633",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the massive growth of events on the Internet,
                 efficient organization and monitoring of events becomes
                 a practical challenge. To deal with this problem, we
                 propose a novel CO-PMHT (CO-Probabilistic
                 Multi-Hypothesis Tracking) algorithm for cross-domain
                 multi-event tracking to obtain their informative
                 summary details and evolutionary trends over time. We
                 collect a large-scale dataset by searching keywords on
                 two domains (Google News and Flickr) and downloading
                 both images and textual content for an event. Given the
                 input data, our algorithm can track multiple events in
                 the two domains collaboratively and boost the tracking
                 performance. Specifically, the bridge between two
                 domains is a semantic posterior probability, that
                 avoids the domain gap. After tracking, we can visualize
                 the whole evolutionary process of the event over time
                 and mine the semantic topics of each event for deep
                 understanding and event prediction. The extensive
                 experimental evaluations on the collected dataset well
                 demonstrate the effectiveness of the proposed algorithm
                 for cross-domain multi-event tracking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Huang:2014:PVR,
  author =       "Qinghua Huang and Bisheng Chen and Jingdong Wang and
                 Tao Mei",
  title =        "Personalized Video Recommendation through Graph
                 Propagation",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "32:1--32:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2598779",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The rapid growth of the number of videos on the
                 Internet provides enormous potential for users to find
                 content of interest. However, the vast quantity of
                 videos also turns the finding process into a difficult
                 task. In this article, we address the problem of
                 providing personalized video recommendation for users.
                 Rather than only exploring the user-video bipartite
                 graph that is formulated using click information, we
                 first combine the clicks and queries information to
                 build a tripartite graph. In the tripartite graph, the
                 query nodes act as bridges to connect user nodes and
                 video nodes. Then, to further enrich the connections
                 between users and videos, three subgraphs between the
                 same kinds of nodes are added to the tripartite graph
                 by exploring content-based information (video tags and
                 textual queries). We propose an iterative propagation
                 algorithm over the enhanced graph to compute the
                 preference information of each user. Experiments
                 conducted on a dataset with 1,369 users, 8,765 queries,
                 and 17,712 videos collected from a commercial video
                 search engine demonstrate the effectiveness of the
                 proposed method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2014:UVS,
  author =       "Haitao Li and Xu Cheng and Jiangchuan Liu",
  title =        "Understanding Video Sharing Propagation in Social
                 Networks: Measurement and Analysis",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "33:1--33:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2594440",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Modern online social networking has drastically
                 changed the information distribution landscape.
                 Recently, video has become one of the most important
                 types of objects spreading among social networking
                 service users. The sheer and ever-increasing data
                 volume, the broader coverage, and the longer access
                 durations of video objects, however, present
                 significantly more challenges than other types of
                 objects. This article takes an initial step toward
                 understanding the unique characteristics of video
                 sharing propagation in social networks. Based on
                 real-world data traces from a large-scale online social
                 network, we examine the user behavior from diverse
                 aspects and identify different types of users involved
                 in video propagation. We closely investigate the
                 temporal distribution during propagation as well as the
                 typical propagation structures, revealing more details
                 beyond stationary coverage. We further extend the
                 conventional epidemic models to accommodate diverse
                 types of users and their probabilistic viewing and
                 sharing behaviors. The model, effectively capturing the
                 essentials of the propagation process, serves as a
                 valuable basis for such applications as workload
                 synthesis, traffic prediction, and resource provision
                 of video servers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2014:BCM,
  author =       "Zhiyu Wang and Peng Cui and Lexing Xie and Wenwu Zhu
                 and Yong Rui and Shiqiang Yang",
  title =        "Bilateral Correspondence Model for Words-and-Pictures
                 Association in Multimedia-Rich Microblogs",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611388",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Nowadays, the amount of multimedia contents in
                 microblogs is growing significantly. More than 20\% of
                 microblogs link to a picture or video in certain large
                 systems. The rich semantics in microblogs provides an
                 opportunity to endow images with higher-level semantics
                 beyond object labels. However, this raises new
                 challenges for understanding the association between
                 multimodal multimedia contents in multimedia-rich
                 microblogs. Disobeying the fundamental assumptions of
                 traditional annotation, tagging, and retrieval systems,
                 pictures and words in multimedia-rich microblogs are
                 loosely associated and a correspondence between
                 pictures and words cannot be established. To address
                 the aforementioned challenges, we present the first
                 study analyzing and modeling the associations between
                 multimodal contents in microblog streams, aiming to
                 discover multimodal topics from microblogs by
                 establishing correspondences between pictures and words
                 in microblogs. We first use a data-driven approach to
                 analyze the new characteristics of the words, pictures,
                 and their association types in microblogs. We then
                 propose a novel generative model called the Bilateral
                 Correspondence Latent Dirichlet Allocation (BC-LDA)
                 model. Our BC-LDA model can assign flexible
                 associations between pictures and words and is able to
                 not only allow picture-word co-occurrence with
                 bilateral directions, but also single modal
                 association. This flexible association can best fit the
                 data distribution, so that the model can discover
                 various types of joint topics and generate pictures and
                 words with the topics accordingly. We evaluate this
                 model extensively on a large-scale real multimedia-rich
                 microblogs dataset. We demonstrate the advantages of
                 the proposed model in several application scenarios,
                 including image tagging, text illustration, and topic
                 discovery. The experimental results demonstrate that
                 our proposed model can significantly and consistently
                 outperform traditional approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lei:2014:FND,
  author =       "Yanqiang Lei and Guoping Qiu and Ligang Zheng and Jiwu
                 Huang",
  title =        "Fast Near-Duplicate Image Detection Using Uniform
                 Randomized Trees",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "35:1--35:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2602186",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Indexing structure plays an important role in the
                 application of fast near-duplicate image detection,
                 since it can narrow down the search space. In this
                 article, we develop a cluster of uniform randomized
                 trees (URTs) as an efficient indexing structure to
                 perform fast near-duplicate image detection. The main
                 contribution in this article is that we introduce
                 ``uniformity'' and ``randomness'' into the indexing
                 construction. The uniformity requires classifying the
                 object images into the same scale subsets. Such a
                 decision makes good use of the two facts in
                 near-duplicate image detection, namely: (1) the number
                 of categories is huge; (2) a single category usually
                 contains only a small number of images. Therefore, the
                 uniform distribution is very beneficial to narrow down
                 the search space and does not significantly degrade the
                 detection accuracy. The randomness is embedded into the
                 generation of feature subspace and projection
                 direction, improving the flexibility of indexing
                 construction. The experimental results show that the
                 proposed method is more efficient than the popular
                 locality-sensitive hashing and more stable and flexible
                 than the traditional KD-tree.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yeh:2014:PPR,
  author =       "Che-Hua Yeh and Brian A. Barsky and Ming Ouhyoung",
  title =        "Personalized Photograph Ranking and Selection System
                 Considering Positive and Negative User Feedback",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2584105",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we propose a novel personalized
                 ranking system for amateur photographs. The proposed
                 framework treats the photograph assessment as a ranking
                 problem and we introduce the idea of personalized
                 ranking, which ranks photographs considering both their
                 aesthetic qualities and personal preferences.
                 Photographs are described using three types of
                 features: photo composition, color and intensity
                 distribution, and personalized features. An aesthetic
                 prediction model is learned from labeled photographs by
                 using the proposed image features and RBF-ListNet
                 learning algorithm. The experimental results show that
                 the proposed framework outperforms in the ranking
                 performance: a Kendall's tau value of 0.432 is
                 significantly higher than those obtained by the
                 features proposed in one of the state-of-the-art
                 approaches (0.365) and by learning based on support
                 vector regression (0.384). To realize personalization
                 in ranking, three approaches are proposed: the
                 feature-based approach allows users to select
                 photographs with specific rules, the example-based
                 approach takes the positive feedback from users to
                 rerank the photograph, and the list-based approach
                 takes both positive and negative feedback from users
                 into consideration. User studies indicate that all
                 three approaches are effective in both aesthetic and
                 personalized ranking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Tan:2014:PVS,
  author =       "Song Tan and Yu-Gang Jiang and Chong-Wah Ngo",
  title =        "Placing Videos on a Semantic Hierarchy for Search
                 Result Navigation",
  journal =      j-TOMM,
  volume =       "10",
  number =       "4",
  pages =        "37:1--37:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2578394",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 8 11:32:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Organizing video search results in a list view is
                 widely adopted by current commercial search engines,
                 which cannot support efficient browsing for complex
                 search topics that have multiple semantic facets. In
                 this article, we propose to organize video search
                 results in a highly structured way. Specifically,
                 videos are placed on a semantic hierarchy that
                 accurately organizes various facets of a given search
                 topic. To pick the most suitable videos for each node
                 of the hierarchy, we define and utilize three important
                 criteria: relevance, uniqueness, and diversity.
                 Extensive evaluations on a large YouTube video dataset
                 demonstrate the effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Steinmetz:2014:EN,
  author          = {Ralf Steinmetz},
  title           = {Editorial Note},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1},
  pages           = {1:1--1:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2634234},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Sep 1 12:38:22 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Liu:2014:SBA,
  author          = {Yong-Jin Liu and Cui-Xia Ma and Qiufang Fu and Xiaolan
                     Fu and Sheng-Feng Qin and Lexing Xie},
  title           = {A Sketch-Based Approach for Interactive Organization
                     of Video Clips},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1},
  pages           = {2:1--2:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2645643},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Sep 1 12:38:22 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {With the rapid growth of video resources, techniques
                     for efficient organization of video clips are becoming
                     appealing in the multimedia domain. In this article, a
                     sketch-based approach is proposed to intuitively
                     organize video clips by: (1) enhancing their narrations
                     using sketch annotations and (2) structurizing the
                     organization process by gesture-based free-form
                     sketching on touch devices. There are two main
                     contributions of this work. The first is a sketch
                     graph, a novel representation for the narrative
                     structure of video clips to facilitate content
                     organization. The second is a method to perform
                     context-aware sketch recommendation scalable to large
                     video collections, enabling common users to easily
                     organize sketch annotations. A prototype system
                     integrating the proposed approach was evaluated on the
                     basis of five different aspects concerning its
                     performance and usability. Two sketch searching
                     experiments showed that the proposed context-aware
                     sketch recommendation outperforms, in terms of accuracy
                     and scalability, two state-of-the-art sketch searching
                     methods. Moreover, a user study showed that the sketch
                     graph is consistently preferred over traditional
                     representations such as keywords and keyframes. The
                     second user study showed that the proposed approach is
                     applicable in those scenarios where the video annotator
                     and organizer were the same person. The third user
                     study showed that, for video content organization,
                     using sketch graph users took on average 1/3 less time
                     than using a mass-market tool Movie Maker and took on
                     average 1/4 less time than using a state-of-the-art
                     sketch alternative. These results demonstrated that the
                     proposed sketch graph approach is a promising video
                     organization tool.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Huang:2014:CSA,
  author =       "Junshi Huang and Si Liu and Junliang Xing and Tao Mei
                 and Shuicheng Yan",
  title =        "{Circle \& Search}: Attribute-Aware Shoe Retrieval",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632165",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 1 12:38:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Taking the shoe as a concrete example, we present an
                 innovative product retrieval system that leverages
                 object detection and retrieval techniques to support a
                 brand-new online shopping experience in this article.
                 The system, called Circle \& Search, enables users to
                 naturally indicate any preferred product by simply
                 circling the product in images as the visual query, and
                 then returns visually and semantically similar products
                 to the users. The system is characterized by
                 introducing attributes in both the detection and
                 retrieval of the shoe. Specifically, we first develop
                 an attribute-aware part-based shoe detection model. By
                 maintaining the consistency between shoe parts and
                 attributes, this shoe detector has the ability to model
                 high-order relations between parts and thus the
                 detection performance can be enhanced. Meanwhile, the
                 attributes of this detected shoe can also be predicted
                 as the semantic relations between parts. Based on the
                 result of shoe detection, the system ranks all the
                 shoes in the repository using an attribute refinement
                 retrieval model that takes advantage of query-specific
                 information and attribute correlation to provide an
                 accurate and robust shoe retrieval. To evaluate this
                 retrieval system, we build a large dataset with 17,151
                 shoe images, in which each shoe is annotated with 10
                 shoe attributes e.g., heel height, heel shape, sole
                 shape, etc. According to the experimental result and
                 the user study, our Circle \& Search system achieves
                 promising shoe retrieval performance and thus
                 significantly improves the users' online shopping
                 experience.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Guan:2014:TAV,
  author          = {Genliang Guan and Zhiyong Wang and Shaohui Mei and Max
                     Ott and Mingyi He and David Dagan Feng},
  title           = {A Top-Down Approach for Video Summarization},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1},
  pages           = {4:1--4:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2632267},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Sep 1 12:38:22 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {While most existing video summarization approaches aim
                     to identify important frames of a video from either a
                     global or local perspective, we propose a top-down
                     approach consisting of scene identification and scene
                     summarization. For scene identification, we represent
                     each frame with global features and utilize a scalable
                     clustering method. We then formulate scene
                     summarization as choosing those frames that best cover
                     a set of local descriptors with minimal redundancy. In
                     addition, we develop a visual word-based approach to
                     make our approach more computationally scalable.
                     Experimental results on two benchmark datasets
                     demonstrate that our proposed approach clearly
                     outperforms the state-of-the-art.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Pazzi:2014:PPP,
  author          = {Richard W. Pazzi and Azzedine Boukerche},
  title           = {{PROPANE}: a Progressive Panorama Streaming Protocol
                     to Support Interactive {$3$D} Virtual Environment
                     Exploration on Graphics-Constrained Devices},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1},
  pages           = {5:1--5:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2602222},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Sep 1 12:38:22 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Image-Based Rendering (IBR) has become widely known by
                     its relatively low requirements for generating new
                     scenes based on a sequence of reference images. This
                     characteristic of IBR shows a remarkable potential
                     impact in rendering complex 3D virtual environments on
                     graphics-constrained devices, such as head-mounted
                     displays, set-top boxes, media streaming devices, and
                     so on. If well exploited, IBR coupled with remote
                     rendering would enable the exploration of complex
                     virtual environments on these devices. However, remote
                     rendering requires the transmission of a large volume
                     of images. In addition, existing solutions consider
                     limited and/or deterministic navigation schemes as a
                     means of decreasing the volume of streamed data. This
                     article proposes the PROgressive PANorama StrEaming
                     protocol (PROPANE) to offer users a smoother virtual
                     navigation experience by prestreaming the imagery data
                     required to generate new views as the user wanders
                     within a 3D environment. PROPANE is based on a very
                     simple yet effective trigonometry model and uses a
                     strafe (lateral movement) technique to minimize the
                     delay between image updates at the client end. This
                     article introduces the concept of key partial
                     panoramas, namely panorama segments that cover
                     movements in any direction by simply strafing from an
                     appropriate key partial panorama and streaming the
                     amount of lost pixels. Therefore, PROPANE can provide a
                     constrained device with sufficient imagery data to
                     cover a future user's viewpoints, thereby minimizing
                     the impact of transmission delay and jitter. PROPANE
                     has been implemented and compared to two baseline
                     remote rendering schemes. The evaluation results show
                     that the proposed technique outperforms the selected
                     and closely related existing schemes by minimizing the
                     response time while not limiting the user to predefined
                     paths as opposed to previous protocols.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Wang:2014:FEM,
  author          = {Xiangyu Wang and Yong Rui and Mohan Kankanhalli},
  title           = {{Up-Fusion}: an Evolving Multimedia Fusion Method},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1},
  pages           = {6:1--6:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2611777},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Sep 1 12:38:22 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The amount of multimedia data on the Internet has
                     increased exponentially in the past few decades and
                     this trend is likely to continue. Multimedia content
                     inherently has multiple information sources, therefore
                     effective fusion methods are critical for data analysis
                     and understanding. So far, most of the existing fusion
                     methods are static with respect to time, making it
                     difficult for them to handle the evolving multimedia
                     content. To address this issue, in recent years,
                     several evolving fusion methods were proposed, however,
                     their requirements are difficult to meet, making them
                     useful only in limited applications. In this article,
                     we propose a novel evolving fusion method based on the
                     online portfolio selection theory. The proposed method
                     takes into account the correlation among different
                     information sources and evolves the fusion model when
                     new multimedia data is added. It performs effectively
                     on both crisp and soft decisions without requiring
                     additional context information. Extensive experiments
                     on concept detection and human detection tasks over the
                     TRECVID dataset and surveillance data have been
                     conducted and significantly better performance has been
                     obtained.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Wang:2014:EIP,
  author          = {Xinxi Wang and Yi Wang and David Hsu and Ye Wang},
  title           = {Exploration in Interactive Personalized Music
                     Recommendation: a Reinforcement Learning Approach},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1},
  pages           = {7:1--7:??},
  month           = aug,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2623372},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Sep 1 12:38:22 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Current music recommender systems typically act in a
                     greedy manner by recommending songs with the highest
                     user ratings. Greedy recommendation, however, is
                     suboptimal over the long term: it does not actively
                     gather information on user preferences and fails to
                     recommend novel songs that are potentially interesting.
                     A successful recommender system must balance the needs
                     to explore user preferences and to exploit this
                     information for recommendation. This article presents a
                     new approach to music recommendation by formulating
                     this exploration-exploitation trade-off as a
                     reinforcement learning task. To learn user preferences,
                     it uses a Bayesian model that accounts for both audio
                     content and the novelty of recommendations. A
                     piecewise-linear approximation to the model and a
                     variational inference algorithm help to speed up
                     Bayesian inference. One additional benefit of our
                     approach is a single unified model for both music
                     recommendation and playlist generation. We demonstrate
                     the strong potential of the proposed approach with
                     simulation results and a user study.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Katti:2014:OEE,
  author =       "Harish Katti and Anoop Kolar Rajagopal and Mohan
                 Kankanhalli and Ramakrishnan Kalpathi",
  title =        "Online Estimation of Evolving Human Visual Interest",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632284",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Sep 1 12:38:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Regions in video streams attracting human interest
                 contribute significantly to human understanding of the
                 video. Being able to predict salient and informative
                 Regions of Interest (ROIs) through a sequence of eye
                 movements is a challenging problem. Applications such
                 as content-aware retargeting of videos to different
                 aspect ratios while preserving informative regions and
                 smart insertion of dialog (closed-caption text)
                 into the video stream can significantly be improved
                 using the predicted ROIs. We propose an interactive
                 human-in-the-loop framework to model eye movements and
                 predict visual saliency into yet-unseen frames. Eye
                 tracking and video content are used to model visual
                 attention in a manner that accounts for important
                 eye-gaze characteristics such as temporal
                 discontinuities due to sudden eye movements, noise, and
                 behavioral artifacts. A novel statistical- and
                 algorithm-based method gaze buffering is proposed for
                 eye-gaze analysis and its fusion with content-based
                 features. Our robust saliency prediction is
                 instantiated for two challenging and exciting
                 applications. The first application alters video aspect
                 ratios on-the-fly using content-aware video
                 retargeting, thus making them suitable for a variety of
                 display sizes. The second application dynamically
                 localizes active speakers and places dialog captions
                 on-the-fly in the video stream. Our method ensures that
                 dialogs are faithful to active speaker locations and do
                 not interfere with salient content in the video stream.
                 Our framework naturally accommodates personalisation of
                 the application to suit biases and preferences of
                 individual users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ghinea:2014:ISI,
  author          = {Gheorghita Ghinea and Christian Timmerer and Weisi Lin
                     and Stephen Gulliver},
  title           = {Introduction to Special Issue on Multiple Sensorial
                     {(MulSeMedia)} Multimodal Media: Advances and
                     Applications},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {9:1--9:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2661333},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Lv:2014:MHF,
  author          = {Zhihan Lv and Alaa Halawani and Shengzhong Feng and
                     Haibo Li and Shafiq Ur R{\'e}hman},
  title           = {Multimodal Hand and Foot Gesture Interaction for
                     Handheld Devices},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {10:1--10:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2645860},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {We present a hand-and-foot-based multimodal
                     interaction approach for handheld devices. Our method
                     combines input modalities (i.e., hand and foot) and
                     provides a coordinated output to both modalities along
                     with audio and video. Human foot gesture is detected
                     and tracked using contour-based template detection
                     (CTD) and Tracking-Learning-Detection (TLD) algorithm.
                     3D foot pose is estimated from passive homography
                     matrix of the camera. 3D stereoscopic and vibrotactile
                     are used to enhance the immersive feeling. We developed
                     a multimodal football game based on the multimodal
                     approach as a proof-of-concept. We confirm our systems
                     user satisfaction through a user study.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Prasad:2014:DVC,
  author =       "Manoj Prasad and Murat Russell and Tracy A. Hammond",
  title =        "Designing Vibrotactile Codes to Communicate Verb
                 Phrases",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1s",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637289",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Oct 3 12:44:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Soldiers, to guard themselves from enemy assault, have
                 to maintain visual and auditory awareness of their
                 environment. Their visual and auditory senses are thus
                 saturated. This makes these channels less usable for
                 communication. The tactile medium of communication with
                 users is appropriate for displaying information in such
                 situations. Research in interpersonal communication
                 among soldiers shows that the most common form of
                 communication between soldiers involves the use of verb
                 phrases. In this article, we have developed a
                 three-by-three tactile display and proposed a method
                 for mapping the components of a verb phrase to two
                 dimensions of tactile codes---shape and waveform.
                 Perception of tactile codes by users depends on the
                 ability of users to distinguish shape and waveform of
                 the code. We have proposed a measure to rate the
                 distinguish-ability of any two shapes and created a
                 graph-based user-centric model using this measure to
                 select distinguishable shapes from a set of all
                 presentable shapes. We conducted two user studies to
                 evaluate the ability of users to perceive tactile
                 information. The results from our first study showed
                 users' ability to perceive tactile shapes, tactile
                 waveforms, and form verb phrases from tactile codes.
                 The recognition accuracy and time taken to distinguish
                 were better when the shapes were selected from the
                 graph model than when shapes were chosen based on
                 intuition. The second user study was conducted to test
                 the performance of users while performing a primary
                 visual task simultaneously with a secondary audio or
                 haptic task. Users were more familiar with perceiving
                 information from an auditory medium than from a haptic
                 medium, which was reflected in their performance. Thus
                 the performance of users in the primary visual task was
                 better while using an audio medium of communication
                 than while using a haptic medium of communication.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Murray:2014:MSE,
  author          = {Niall Murray and Brian Lee and Yuansong Qiao and
                     Gabriel-Miro Muntean},
  title           = {Multiple-Scent Enhanced Multimedia Synchronization},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {12:1--12:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2637293},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This study looked at users' perception of interstream
                     synchronization between audiovisual media and two
                     olfactory streams. The ability to detect skews and the
                     perception and impact of skews on user Quality of
                     Experience (QoE) is analyzed. The olfactory streams are
                     presented with the same skews (i.e., delay) and with
                     variable skews (i.e., jitter and mix of scents). This
                     article reports the limits beyond which
                     desynchronization reduces user-perceived quality
                     levels. Also, a minimum gap between the presentations
                     of consecutive scents is identified, necessary to
                     ensuring enhanced user-perceived quality. There is no
                     evidence (not considering scent type) that overlapping
                     or mixing of scents increases user QoE levels for
                     olfaction-enhanced multimedia.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Kroupi:2014:ECP,
  author          = {Eleni Kroupi and Ashkan Yazdani and Jean-Marc Vesin
                     and Touradj Ebrahimi},
  title           = {{EEG} Correlates of Pleasant and Unpleasant Odor
                     Perception},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {13:1--13:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2637287},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Olfaction-enhanced multimedia experience is becoming
                     vital for strengthening the sensation of reality and
                     the quality of user experience. One approach to
                     investigate olfactory perception is to analyze the
                     alterations in brain activity during stimulation with
                     different odors. In this article, the changes in the
                     electroencephalogram (EEG) when perceiving
                     hedonically-different odors are studied. Results of
                     within and across-subject analysis are presented. We
                     show that EEG-based odor classification using brain
                     activity is possible and can be used to automatically
                     recognize odor pleasantness when a subject-specific
                     classifier is trained. However, it is a challenging
                     problem to design a generic classifier.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Rainer:2014:GUM,
  author          = {Benjamin Rainer and Christian Timmerer},
  title           = {A Generic Utility Model Representing the Quality of
                     Sensory Experience},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {14:1--14:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2648429},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Current QoE research is mainly focusing on single
                     modalities (audio, visual) or combinations thereof. In
                     our research, we propose annotating traditional
                     multimedia content with additional sensory effects,
                     such as ambient light, vibration, wind, and olfaction,
                     which could potentially stimulate all human senses.
                     Investigating the influence of individual sensory
                     effects and combinations thereof is important in order
                     to understand how these individual sensory effects
                     influence the Quality of Experience (QoE) as a whole.
                     In this article, we describe the results of such a
                     subjective quality assessment of audio-visual sequences
                     which are annotated with additional sensory effects
                     such as ambient light, wind, and vibration using the
                     MPEG-V standard. The results of this assessment allow
                     us to derive a utility model representing the Quality
                     of Sensory Experience (QuaSE) complementary to existing
                     QoE models described in terms of Quality of Service
                     (QoS) parameters. For validating our proposed utility
                     model, we provide an example instantiation and validate
                     it against results of subjective quality assessments.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Yuan:2014:UQE,
  author          = {Zhenhui Yuan and Shengyang Chen and Gheorghita Ghinea
                     and Gabriel-Miro Muntean},
  title           = {User Quality of Experience of Mulsemedia
                     Applications},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {15:1--15:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2661329},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {User Quality of Experience (QoE) is of fundamental
                     importance in multimedia applications and has been
                     extensively studied for decades. However, user QoE in
                     the context of the emerging multiple-sensorial media
                     (mulsemedia) services, which involve different media
                     components than the traditional multimedia
                     applications, have not been comprehensively studied.
                     This article presents the results of subjective tests
                     which have investigated user perception of mulsemedia
                     content. In particular, the impact of intensity of
                     certain mulsemedia components including haptic and
                     airflow on user-perceived experience are studied.
                     Results demonstrate that by making use of mulsemedia
                     the overall user enjoyment levels increased by up to
                     77\%.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Luque:2014:IMS,
  author          = {Francisco Pedro Luque and Iris Galloso and Claudio
                     Feijoo and Carlos Alberto Mart{\'\i}n and Guillermo
                     Cisneros},
  title           = {Integration of Multisensorial Stimuli and Multimodal
                     Interaction in a Hybrid {$3$DTV} System},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {16:1--16:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2617992},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article proposes the integration of
                     multisensorial stimuli and multimodal interaction
                     components into a sports multimedia asset under two
                     dimensions: immersion and interaction. The first
                     dimension comprises a binaural audio system and a set
                     of sensory effects synchronized with the audiovisual
                     content, whereas the second explores interaction
                     through the insertion of interactive 3D objects into
                     the main screen and on-demand presentation of
                     additional information in a second touchscreen. We
                     present an end-to-end solution integrating these
                     components into a hybrid (internet-broadcast)
                     television system using current 3DTV standards. Results
                     from an experimental study analyzing the perceived
                     quality of these stimuli and their influence on the
                     Quality of Experience are presented.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ghinea:2014:MSA,
  author =       "Gheorghita Ghinea and Christian Timmerer and Weisi Lin
                 and Stephen R. Gulliver",
  title =        "Mulsemedia: State of the Art, Perspectives, and
                 Challenges",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1s",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2617994",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Oct 3 12:44:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Mulsemedia---multiple sensorial media---captures a wide
                 variety of research efforts and applications. This
                 article presents a historic perspective on mulsemedia
                 work and reviews current developments in the area.
                 These take place across the traditional multimedia
                 spectrum---from virtual reality applications to computer
                 games---as well as efforts in the arts, gastronomy, and
                 therapy, to mention a few. We also describe
                 standardization efforts, via the MPEG-V standard, and
                 identify future developments and exciting challenges
                 the community needs to overcome.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zha:2014:ISI,
  author          = {Zheng-Jun Zha and Lei Zhang and Max M{\"u}hlh{\"a}user
                     and Alan F. Smeaton},
  title           = {Introduction to the Special Issue Best Papers of {ACM
                     Multimedia 2013}},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {18:1--18:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2661331},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Fang:2014:DGI,
  author          = {Quan Fang and Jitao Sang and Changsheng Xu},
  title           = {Discovering Geo-Informative Attributes for Location
                     Recognition and Exploration},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {19:1--19:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2648581},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article considers the problem of automatically
                     discovering geo-informative attributes for location
                     recognition and exploration. The attributes are
                     expected to be both discriminative and representative,
                     which correspond to certain distinctive visual patterns
                     and associate with semantic interpretations. For our
                     solution, we analyze the attribute at the region level.
                     Each segmented region in the training set is assigned a
                     binary latent variable indicating its discriminative
                     capability. A latent learning framework is proposed for
                     discriminative region detection and geo-informative
                     attribute discovery. Moreover, we use user-generated
                     content to obtain the semantic interpretation for the
                     discovered visual attributes. Discriminative and
                     search-based attribute annotation methods are developed
                     for geo-informative attribute interpretation. The
                     proposed approach is evaluated on one challenging
                     dataset including GoogleStreetView and Flickr photos.
                     Experimental results show that (1) geo-informative
                     attributes are discriminative and useful for location
                     recognition; (2) the discovered semantic interpretation
                     is meaningful and can be exploited for further location
                     exploration.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Liu:2014:WYB,
  author          = {Luoqi Liu and Junliang Xing and Si Liu and Hui Xu and
                     Xi Zhou and Shuicheng Yan},
  title           = {{``Wow! You Are So Beautiful Today!''}},
  journal         = j-TOMM,
  volume          = {11},
  number          = {1s},
  pages           = {20:1--20:??},
  month           = sep,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2659234},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Fri Oct 3 12:44:25 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Beauty e-Experts, a fully automatic system for
                     makeover recommendation and synthesis, is developed in
                     this work. The makeover recommendation and synthesis
                     system simultaneously considers many kinds of makeover
                     items on hairstyle and makeup. Given a user-provided
                     frontal face image with short/bound hair and no/light
                     makeup, the Beauty e-Experts system not only recommends
                     the most suitable hairdo and makeup, but also
                     synthesizes the virtual hairdo and makeup effects. To
                     acquire enough knowledge for beauty modeling, we built
                     the Beauty e-Experts Database, which contains 1,505
                     female photos with a variety of attributes annotated
                     with different discrete values. We organize these
                     attributes into two different categories, beauty
                     attributes and beauty-related attributes. Beauty
                     attributes refer to those values that are changeable
                     during the makeover process and thus need to be
                     recommended by the system. Beauty-related attributes
                     are those values that cannot be changed during the
                     makeup process but can help the system to perform
                     recommendation. Based on this Beauty e-Experts Dataset,
                     two problems are addressed for the Beauty e-Experts
                     system: what to recommend and how to wear it, which
                     describes a similar process of selecting hairstyle and
                     cosmetics in daily life. For the what-to-recommend
                     problem, we propose a multiple tree-structured
                     supergraph model to explore the complex relationships
                     among high-level beauty attributes, mid-level
                     beauty-related attributes, and low-level image
                     features. Based on this model, the most compatible
                     beauty attributes for a given facial image can be
                     efficiently inferred. For the how-to-wear-it problem,
                     an effective and efficient facial image synthesis
                     module is designed to seamlessly synthesize the
                     recommended makeovers into the user facial image. We
                     have conducted extensive experiments on testing images
                     of various conditions to evaluate and analyze the
                     proposed system. The experimental results well
                     demonstrate the effectiveness and efficiency of the
                     proposed system.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Zhang:2014:AAS,
  author =       "Hanwang Zhang and Zheng-Jun Zha and Yang Yang and
                 Shuicheng Yan and Yue Gao and Tat-Seng Chua",
  title =        "Attribute-Augmented Semantic Hierarchy: Towards a
                 Unified Framework for Content-Based Image Retrieval",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1s",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637291",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Oct 3 12:44:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a novel attribute-augmented
                 semantic hierarchy (A$^2$SH) and demonstrates its
                 effectiveness in bridging both the semantic and
                 intention gaps in content-based image retrieval (CBIR).
                 A$^2$SH organizes semantic concepts into multiple
                 semantic levels and augments each concept with a set of
                 related attributes. The attributes are used to describe
                 the multiple facets of the concept and act as the
                 intermediate bridge connecting the concept and
                 low-level visual content. An hierarchical semantic
                 similarity function is learned to characterize the
                 semantic similarities among images for retrieval. To
                 better capture user search intent, a hybrid feedback
                 mechanism is developed, which collects hybrid feedback
                 on attributes and images. This feedback is then used to
                 refine the search results based on A$^2$SH. We use
                 A$^2$SH as a basis to develop a unified content-based
                 image retrieval system. We conduct extensive
                 experiments on a large-scale dataset of over one
                 million Web images. Experimental results show that the
                 proposed A$^2$SH can characterize the semantic
                 affinities among images accurately and can shape user
                 search intent quickly, leading to more accurate search
                 results as compared to state-of-the-art CBIR
                 solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhao:2014:SSS,
  author =       "Xin Zhao and Xue Li and Chaoyi Pang and Quan Z. Sheng
                 and Sen Wang and Mao Ye",
  title =        "Structured Streaming Skeleton --- A New Feature for
                 Online Human Gesture Recognition",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1s",
  pages =        "22:1--22:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2648583",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Oct 3 12:44:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online human gesture recognition has a wide range of
                 applications in computer vision, especially in
                 human-computer interaction applications. The recent
                 introduction of cost-effective depth cameras brings a
                 new trend of research on body-movement gesture
                 recognition. However, there are two major challenges:
                 (i) how to continuously detect gestures from
                 unsegmented streams, and (ii) how to differentiate
                 different styles of the same gesture from other types
                 of gestures. In this article, we solve these two
                 problems with a new effective and efficient feature
                 extraction method---Structured Streaming Skeleton
                 (SSS)---which uses a dynamic matching approach to
                 construct a feature vector for each frame. Our
                 comprehensive experiments on MSRC-12 Kinect Gesture,
                 Huawei/3DLife-2013, and MSR-Action3D datasets have
                 demonstrated superior performances than the
                 state-of-the-art approaches. We also demonstrate model
                 selection based on the proposed SSS feature, where the
                 classifier of squared loss regression with $\ell_{2,1}$
                 norm regularization is a recommended classifier for
                 best performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Carbunar:2014:EFN,
  author =       "Bogdan Carbunar and Rahul Potharaju and Michael Pearce
                 and Venugopal Vasudevan and Michael Needham",
  title =        "Errata for: {A Framework for Network Aware Caching for
                 Video on Demand Systems}",
  journal =      j-TOMM,
  volume =       "11",
  number =       "1s",
  pages =        "23:1--23:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661298",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Oct 3 12:44:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  note =         "See \cite{Carbunar:2013:FNA}.",
  abstract =     "Some errors were introduced into this article in the
                 preparation of the final source files. The errors are
                 summarized in the following text and revised pages with
                 the corrected elements indicated in red are provided.
                 The full corrected article can be accessed in the ACM
                 DL, DOI https://doi.org/10.1145/2501643.2501652 ---
                 Page 8: New Figure 6(a) --- Page 16: New Figures 8(a),
                 8(b), and 9(a) --- Page 17: New Figure 10(b) --- Page
                 18: New Figures 11 and 12; corrected text reference ---
                 Page 19: Final sentence deleted.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2014:AGS,
  author          = {Ying Zhang and Luming Zhang and Roger Zimmermann},
  title           = {Aesthetics-Guided Summarization from Multiple User
                     Generated Videos},
  journal         = j-TOMM,
  volume          = {11},
  number          = {2},
  pages           = {24:1--24:??},
  month           = dec,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2659520},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Jan 7 17:48:10 MST 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In recent years, with the rapid development of camera
                     technology and portable devices, we have witnessed a
                     flourish of user generated videos, which are gradually
                     reshaping the traditional professional video oriented
                     media market. The volume of user generated videos in
                     repositories is increasing at a rapid rate. In today's
                     video retrieval systems, a simple query will return
                     many videos which seriously increase the viewing
                     burden. To manage these video retrievals and provide
                     viewers with an efficient way to browse, we introduce a
                     system to automatically generate a summarization from
                     multiple user generated videos and present their
                     salience to viewers in an enjoyable manner. Among
                     multiple consumer videos, we find their qualities to be
                     highly diverse due to various factors such as a
                     photographer's experience or environmental conditions
                     at the time of capture. Such quality inspires us to
                     include a video quality evaluation component into the
                     video summarization since videos with poor qualities
                     can seriously degrade the viewing experience. We first
                     propose a probabilistic model to evaluate the aesthetic
                     quality of each user generated video. This model
                     compares the rich aesthetics information from several
                     well-known photo databases with generic unlabeled
                     consumer videos, under a human perception component
                     indicating the correlation between a video and its
                     constituting frames. Subjective studies were carried
                     out with the results indicating that our method is
                     reliable. Then a novel graph-based formulation is
                     proposed for the multi-video summarization task.
                     Desirable summarization criteria is incorporated as the
                     graph attributes and the problem is solved through a
                     dynamic programming framework. Comparisons with several
                     state-of-the-art methods demonstrate that our algorithm
                     performs better than other methods in generating a
                     skimming video in preserving the essential scenes from
                     the original multiple input videos, with smooth
                     transitions among consecutive segments and appealing
                     aesthetics overall.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {24},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Calagari:2014:AAL,
  author          = {Kiana Calagari and Mohammad Reza Pakravan and Shervin
                     Shirmohammadi and Mohamed Hefeeda},
  title           = {{ALP}: Adaptive Loss Protection Scheme with Constant
                     Overhead for Interactive Video Applications},
  journal         = j-TOMM,
  volume          = {11},
  number          = {2},
  pages           = {25:1--25:??},
  month           = dec,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2656203},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Jan 7 17:48:10 MST 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {There has been an increasing demand for interactive
                     video transmission over the Internet for applications
                     such as video conferencing, video calls, and
                     telepresence applications. These applications are
                     increasingly moving towards providing High Definition
                     (HD) video quality to users. A key challenge in these
                     applications is to preserve the quality of video when
                     it is transported over best-effort networks that do not
                     guarantee lossless transport of video packets. In such
                     conditions, it is important to protect the transmitted
                     video by using intelligent and adaptive protection
                     schemes. Applications such as HD video conferencing
                     require live interaction among participants, which
                     limits the overall delay the system can tolerate.
                     Therefore, the protection scheme should add little or
                     no extra delay to video transport. We propose a novel
                     Adaptive Loss Protection (ALP) scheme for interactive
                     HD video applications such as video conferencing and
                     video chats. This scheme adds negligible delay to the
                     transmission process and is shown to achieve better
                     quality than other schemes in lossy networks. The
                     proposed ALP scheme adaptively applies four different
                     protection modes to cope with the dynamic network
                     conditions, which results in high video quality in all
                     network conditions. Our ALP scheme consists of four
                     protection modes; each of these modes utilizes a
                     protection method. Two of the modes rely on the
                     state-of-the-art protection methods, and we propose a
                     new Integrated Loss Protection (ILP) method for the
                     other two modes. In the ILP method we integrate three
                     factors for distributing the protection among packets.
                     These three factors are error propagation, region of
                     interest and header information. In order to decide
                     when to switch between the protection modes, a new
                     metric is proposed based on the effectiveness of each
                     mode in performing protection, rather than just
                     considering network statistics such as packet loss
                     rate. Results show that by using this metric not only
                     the overall quality will be improved but also the
                     variance of quality will decrease. One of the main
                     advantages of the proposed ALP scheme is that it does
                     not increase the bit rate overhead in poor network
                     conditions. Our results show a significant gain in
                     video quality, up to 3dB PSNR improvement is achieved
                     using our scheme, compared to protecting all packets
                     equally with the same amount of overhead.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ren:2014:BGO,
  author          = {Dongni Ren and Yisheng Xu and S.-H. Gary Chan},
  title           = {Beyond {1Mbps} Global Overlay Live Streaming: The Case
                     of Proxy Helpers},
  journal         = j-TOMM,
  volume          = {11},
  number          = {2},
  pages           = {26:1--26:??},
  month           = dec,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2652485},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Jan 7 17:48:10 MST 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In order to provide live streaming over the global
                     Internet, a content provider often deploys an overlay
                     network consisting of distributed proxies placed close
                     to user pools. Streaming of multi-Mbps video over such
                     an overlay is challenging because of bandwidth
                     bottlenecks in paths. To effectively overcome these
                     bottlenecks, we consider employing proxy helpers in the
                     overlay to provide rich path diversity. The helpers do
                     not have any attached users, and hence may forward
                     partial video streams (or not at all) if necessary. In
                     this way, the helpers serve as stepping stones to
                     supply full streams to the servers. The issue is how to
                     involve the helpers in the overlay to achieve low
                     streaming delay meeting a certain high streaming
                     bitrate requirement. To address the issue, we first
                     formulate the problem which captures various delay and
                     bandwidth components, and show that it is NP-hard. We
                     then propose an efficient algorithm called
                     Stepping-Stones (SS) which can be efficiently
                     implemented in a controller. Given the encouraging
                     simulation results, we develop a novel streaming
                     testbed for SS and explore, through sets of Internet
                     experiments, the effectiveness of helpers to achieve
                     high bitrate (multi-Mbps) global live streaming. In our
                     experiments, proxies are deployed with a reasonably
                     wide global footprint. We collect more than a hundred
                     hours of streaming traces with bitrate ranging from
                     500kbps to a few Mbps. Our experimental data validates
                     that helpers indeed play an important role in achieving
                     high bitrate in today's Internet. Global multi-Mbps
                     streaming is possible due to their multihop and
                     multipath advantages. Our experimental trials and data
                     also provide valuable insights on the design of a
                     global push-based streaming network. There are strong
                     benefits of using proxy helpers to achieve high bitrate
                     and low delay.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {26},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Qian:2014:SEC,
  author          = {Shengsheng Qian and Tianzhu Zhang and Changsheng Xu
                     and M. Shamim Hossain},
  title           = {Social Event Classification via Boosted Multimodal
                     Supervised Latent {Dirichlet} Allocation},
  journal         = j-TOMM,
  volume          = {11},
  number          = {2},
  pages           = {27:1--27:??},
  month           = dec,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2659521},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Jan 7 17:48:10 MST 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {With the rapidly increasing popularity of social media
                     sites (e.g., Flickr, YouTube, and Facebook), it is
                     convenient for users to share their own comments on
                     many social events, which successfully facilitates
                     social event generation, sharing and propagation and
                     results in a large amount of user-contributed media
                     data (e.g., images, videos, and text) for a wide
                     variety of real-world events of different types and
                     scales. As a consequence, it has become more and more
                     difficult to exactly find the interesting events from
                     massive social media data, which is useful to browse,
                     search and monitor social events by users or
                     governments. To deal with these issues, we propose a
                     novel boosted multimodal supervised Latent Dirichlet
                     Allocation (BMM-SLDA) for social event classification
                     by integrating a supervised topic model, denoted as
                     multi-modal supervised Latent Dirichlet Allocation
                     (mm-SLDA), in the boosting framework. Our proposed
                     BMM-SLDA has a number of advantages. (1) Our mm-SLDA
                     can effectively exploit the multimodality and the
                     multiclass property of social events jointly, and make
                     use of the supervised category label information to
                     classify multiclass social event directly. (2) It is
                     suitable for large-scale data analysis by utilizing
                     boosting weighted sampling strategy to iteratively
                     select a small subset of data to efficiently train the
                     corresponding topic models. (3) It effectively exploits
                     social event structure by the document weight
                     distribution with classification error and can
                     iteratively learn new topic model to correct the
                     previously misclassified event documents. We evaluate
                     our BMM-SLDA on a real world dataset and show extensive
                     experimental results, which demonstrate that our model
                     outperforms state-of-the-art methods.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {27},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ye:2014:OBL,
  author          = {Jun Ye and Kien A. Hua},
  title           = {Octree-Based {$3$D} Logic and Computation of Spatial
                     Relationships in Live Video Query Processing},
  journal         = j-TOMM,
  volume          = {11},
  number          = {2},
  pages           = {28:1--28:??},
  month           = dec,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2645864},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Jan 7 17:48:10 MST 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Live video computing (LVC) on distributed smart
                     cameras has many important applications; and a database
                     approach based on a Live Video DataBase Management
                     System (LVDBMS) has shown to be effective for general
                     LVC application development. The performance of such a
                     database system relies on accurate interpretation of
                     spatial relationships among objects in the live video.
                     With the popularity of affordable depth cameras, 3D
                     spatial computation techniques have been applied.
                     However, the 3D object models currently used are
                     expensive to compute, and offer limited scalability. We
                     address this drawback in this article by proposing an
                     octree-based 3D spatial logic and presenting algorithms
                     for computing 3D spatial relationships using depth
                     cameras. To support continuous query processing on live
                     video streams, we also develop a GPU-based
                     implementation of the proposed technique to further
                     enhance scalability for real-time applications.
                     Extensive performance studies based on a public RGB-D
                     dataset as well as the LVDBMS prototype demonstrates
                     the correctness and efficiency of our techniques.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {28},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Yin:2014:STT,
  author          = {Yifang Yin and Zhijie Shen and Luming Zhang and Roger
                     Zimmermann},
  title           = {Spatial-Temporal Tag Mining for Automatic Geospatial
                     Video Annotation},
  journal         = j-TOMM,
  volume          = {11},
  number          = {2},
  pages           = {29:1--29:??},
  month           = dec,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2658981},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Jan 7 17:48:10 MST 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Videos are increasingly geotagged and used in
                     practical and powerful GIS applications. However, video
                     search and management operations are typically
                     supported by manual textual annotations, which are
                     subjective and laborious. Therefore, research has been
                     conducted to automate or semi-automate this process.
                     Since a diverse vocabulary for video annotations is of
                     paramount importance towards good search results, this
                     article proposes to leverage crowdsourced data from
                     social multimedia applications that host tags of
                     diverse semantics to build a spatio-temporal tag
                     repository, consequently acting as input to our
                     auto-annotation approach. In particular, to build the
                     tag store, we retrieve the necessary data from several
                     social multimedia applications, mine both the spatial
                     and temporal features of the tags, and then refine and
                     index them accordingly. To better integrate the tag
                     repository, we extend our previous approach by
                     leveraging the temporal characteristics of videos as
                     well. Moreover, we set up additional ranking criteria
                     on the basis of tag similarity, popularity and location
                     bias. Experimental results demonstrate that, by making
                     use of such a tag repository, the generated tags have a
                     wide range of semantics, and the resulting rankings are
                     more consistent with human perception.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {29},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Lin:2014:LAM,
  author =       "Chih-Wei Lin and Kuan-Wen Chen and Shen-Chi Chen and
                 Cheng-Wu Chen and Yi-Ping Hung",
  title =        "Large-Area, Multilayered, and High-Resolution Visual
                 Monitoring Using a Dual-Camera System",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2",
  pages =        "30:1--30:??",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2645862",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 7 17:48:10 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Large-area, high-resolution visual monitoring systems
                 are indispensable in surveillance applications. To
                 construct such systems, high-quality image capture and
                 display devices are required. Whereas high-quality
                 displays have rapidly developed, as exemplified by the
                 announcement of the 85-inch 4K ultrahigh-definition TV
                 by Samsung at the 2013 Consumer Electronics Show (CES),
                 high-resolution surveillance cameras have progressed
                 slowly and remain not widely used compared with
                 displays. In this study, we designed an innovative
                 framework, using a dual-camera system comprising a
                 wide-angle fixed camera and a high-resolution
                 pan-tilt-zoom (PTZ) camera to construct a large-area,
                 multilayered, and high-resolution visual monitoring
                 system that features multiresolution monitoring of
                 moving objects. First, we developed a novel calibration
                 approach to estimate the relationship between the two
                 cameras and calibrate the PTZ camera. The PTZ camera
                 was calibrated based on the consistent property of
                 distinct pan-tilt angle at various zooming factors,
                 accelerating the calibration process without affecting
                 accuracy; this calibration process has not been
                 reported previously. After calibrating the dual-camera
                 system, we used the PTZ camera and synthesized a
                 large-area and high-resolution background image. When
                 foreground targets were detected in the images captured
                 by the wide-angle camera, the PTZ camera was controlled
                 to continuously track the user-selected target. Last,
                 we integrated preconstructed high-resolution background
                 and low-resolution foreground images captured using the
                 wide-angle camera and the high-resolution foreground
                 image captured using the PTZ camera to generate a
                 large-area, multilayered, and high-resolution view of
                 the scene.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Deng:2014:TFP,
  author =       "Zhengyu Deng and Ming Yan and Jitao Sang and
                 Changsheng Xu",
  title =        "{Twitter} is Faster: Personalized Time-Aware Video
                 Recommendation from {Twitter} to {YouTube}",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2",
  pages =        "31:1--31:??",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637285",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 7 17:48:10 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Traditional personalized video recommendation methods
                 focus on utilizing user profile or user history
                 behaviors to model user interests, which follows a
                 static strategy and fails to capture the swift shift of
                 the short-term interests of users. According to our
                 cross-platform data analysis, the information emergence
                 and propagation is faster in social textual
                 stream-based platforms than that in multimedia sharing
                 platforms at micro user level. Inspired by this, we
                 propose a dynamic user modeling strategy to tackle
                 personalized video recommendation issues in the
                 multimedia sharing platform YouTube, by transferring
                 knowledge from the social textual stream-based platform
                 Twitter. In particular, the cross-platform video
                 recommendation strategy is divided into two steps. (1)
                 Real-time hot topic detection: the hot topics that
                 users are currently following are extracted from users'
                 tweets, which are utilized to obtain the related videos
                 in YouTube. (2) Time-aware video recommendation: for
                 the target user in YouTube, the obtained videos are
                 ranked by considering the user profile in YouTube, time
                 factor, and quality factor to generate the final
                 recommendation list. In this way, the short-term (hot
                 topics) and long-term (user profile) interests of users
                 are jointly considered. Carefully designed experiments
                 have demonstrated the advantages of the proposed
                 method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hu:2014:SFV,
  author =       "Yongtao Hu and Jan Kautz and Yizhou Yu and Wenping
                 Wang",
  title =        "Speaker-Following Video Subtitles",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2632111",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 7 17:48:10 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "We propose a new method for improving the presentation
                 of subtitles in video (e.g., TV and movies). With
                 conventional subtitles, the viewer has to constantly
                 look away from the main viewing area to read the
                 subtitles at the bottom of the screen, which disrupts
                 the viewing experience and causes unnecessary
                 eyestrain. Our method places on-screen subtitles next
                 to the respective speakers to allow the viewer to
                 follow the visual content while simultaneously reading
                 the subtitles. We use novel identification algorithms
                 to detect the speakers based on audio and visual
                 information. Then the placement of the subtitles is
                 determined using global optimization. A comprehensive
                 usability study indicated that our subtitle placement
                 method outperformed both conventional fixed-position
                 subtitling and another previous dynamic subtitling
                 method in terms of enhancing the overall viewing
                 experience and reducing eyestrain.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2015:ISI,
  author =       "Kuan-Ta Chen and Songqing Chen and Wei Tsang Ooi",
  title =        "Introduction to the Special Issue on {MMSys 2014} and
                 {NOSSDAV 2014}",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "41:1--41:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717509",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Schaber:2015:CAM,
  author =       "Philipp Schaber and Stephan Kopf and Sina Wetzel and
                 Tyler Ballast and Christoph Wesch and Wolfgang
                 Effelsberg",
  title =        "{CamMark}: Analyzing, Modeling, and Simulating
                 Artifacts in Camcorder Copies",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "42:1--42:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700295",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "To support the development of any system that includes
                 the generation and evaluation of camcorder copies, as
                 well as to provide a common benchmark for robustness
                 against camcorder copies, we present a tool to simulate
                 digital video re-acquisition using a digital video
                 camera. By resampling each video frame, we simulate the
                 typical artifacts occurring in a camcorder copy:
                 geometric modifications (aspect ratio changes,
                 cropping, perspective and lens distortion), temporal
                 sampling artifacts (due to different frame rates,
                 shutter speeds, rolling shutters, or playback), spatial
                 and color subsampling (rescaling, filtering, Bayer
                 color filter array), and processing steps (automatic
                 gain control, automatic white balance). We also support
                 the simulation of camera movement (e.g., a hand-held
                 camera) and background insertion. Furthermore, we allow
                 for an easy setup and calibration of all the simulated
                 artifacts, using sample/reference pairs of images and
                 videos. Specifically temporal subsampling effects are
                 analyzed in detail to create realistic frame blending
                 artifacts in the simulated copies. We carefully
                 evaluated our entire camcorder simulation system and
                 found that the models we developed describe and match
                 the real artifacts quite well.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Toni:2015:OSA,
  author =       "Laura Toni and Ramon Aparicio-Pardo and Karine Pires
                 and Gwendal Simon and Alberto Blanc and Pascal
                 Frossard",
  title =        "Optimal Selection of Adaptive Streaming
                 Representations",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "43:1--43:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700294",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Adaptive streaming addresses the increasing and
                 heterogeneous demand of multimedia content over the
                 Internet by offering several encoded versions for each
                 video sequence. Each version (or representation) is
                 characterized by a resolution and a bit rate, and it is
                 aimed at a specific set of users, like TV or mobile
                 phone clients. While most existing works on adaptive
                 streaming deal with effective playout-buffer control
                 strategies on the client side, in this article we take
                 a providers' perspective and propose solutions to
                 improve user satisfaction by optimizing the set of
                 available representations. We formulate an integer
                 linear program that maximizes users' average
                 satisfaction, taking into account network dynamics,
                 type of video content, and user population
                 characteristics. The solution of the optimization is a
                 set of encoding parameters corresponding to the
                 representations set that maximizes user satisfaction.
                 We evaluate this solution by simulating multiple
                 adaptive streaming sessions characterized by realistic
                 network statistics, showing that the proposed solution
                 outperforms commonly used vendor recommendations, in
                 terms of user satisfaction but also in terms of
                 fairness and outage probability. The simulation results
                 show that video content information as well as network
                 constraints and users' statistics play a crucial role
                 in selecting proper encoding parameters to provide
                 fairness among users and to reduce network resource
                 usage. We finally propose a few theoretical guidelines
                 that can be used, in realistic settings, to choose the
                 encoding parameters based on the user characteristics,
                 the network capacity and the type of video content.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2015:ADF,
  author =       "Liang Chen and Yipeng Zhou and Dah Ming Chiu",
  title =        "Analysis and Detection of Fake Views in Online Video
                 Services",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "44:1--44:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700290",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online video-on-demand (VoD) services invariably
                 maintain a view count for each video they serve, and it
                 has become an important currency for various
                 stakeholders, from viewers, to content owners,
                 advertizers, and the online service providers
                 themselves. There is often significant financial
                 incentive to use a robot (or a botnet) to artificially
                 create fake views. How can we detect fake views? Can we
                 detect them (and stop them) efficiently? What is the
                 extent of fake views with current VoD service
                 providers? These are the questions we study in this
                 article. We develop some algorithms and show that they
                 are quite effective for this problem.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Song:2015:SVT,
  author =       "Minseok Song and Yeongju Lee and Jinhan Park",
  title =        "Scheduling a Video Transcoding Server to Save Energy",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "45:1--45:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700282",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Recent popular streaming services such as TV
                 Everywhere, N-Screen, and dynamic adaptive streaming
                 over HTTP (DASH) need to deliver content to the wide
                 range of devices, requiring video content to be
                 transcoded into different versions. Transcoding tasks
                 require a lot of computation, and each task typically
                 has its own real-time constraint. These make it
                 difficult to manage transcoding, but the more efficient
                 use of energy in servers is an imperative. We
                 characterize transcoding workloads in terms of
                 deadlines and computation times, and propose a new
                 dynamic voltage and frequency scaling (DVFS) scheme
                 that allocates a frequency and a workload to each CPU
                 with the aim of minimizing power consumption while
                 meeting all transcoding deadlines. This scheme has been
                 simulated, and also implemented in a Linux transcoding
                 server, in which a frontend node distributes
                 transcoding requests to heterogeneous backend nodes.
                 This required a new protocol for communication between
                 nodes, a DVFS management scheme to reduce power
                 consumption and thread management and scheduling
                 schemes which ensure that transcoding deadlines are
                 met. Power measurements show that this approach can
                 reduce system-wide energy consumption by 17\% to 31\%,
                 compared with the Linux Ondemand governor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Langroodi:2015:DCA,
  author =       "Mohsen Jamali Langroodi and Joseph Peters and Shervin
                 Shirmohammadi",
  title =        "Decoder-Complexity-Aware Encoding of Motion
                 Compensation for Multiple Heterogeneous Receivers",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "46:1--46:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700300",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "For mobile multimedia systems, advances in battery
                 technology have been much slower than those in memory,
                 graphics, and processing power, making power
                 consumption a major concern in mobile systems. The
                 computational complexity of video codecs, which
                 consists of CPU operations and memory accesses, is one
                 of the main factors affecting power consumption. In
                 this article, we propose a method that achieves
                 near-optimal video quality while respecting
                 user-defined bounds on the complexity needed to decode
                 a video. We specifically focus on the motion
                 compensation process, including motion vector
                 prediction and interpolation, because it is the single
                 largest component of computation-based power
                 consumption. We start by formulating a scenario with a
                 single receiver as a rate-distortion optimization
                 problem and we develop an efficient
                 decoder-complexity-aware video encoding method to solve
                 it. Then we extend our approach to handle multiple
                 heterogeneous receivers, each with a different
                 complexity requirement. We test our method
                 experimentally using the H.264 standard for the single
                 receiver scenario and the H.264 SVC extension for the
                 multiple receiver scenario. Our experimental results
                 show that our method can achieve up to 97\% of the
                 optimal solution value in the single receiver scenario,
                 and an average of 97\% of the optimal solution value in
                 the multiple receiver scenario. Furthermore, our tests
                 with actual power measurements show a power saving of
                 up to 23\% at the decoder when the complexity threshold
                 is halved in the encoder.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2015:TAT,
  author =       "Shannon Chen and Zhenhuan Gao and Klara Nahrstedt and
                 Indranil Gupta",
  title =        "{$3$DTI} Amphitheater: Towards {$3$DTI} Broadcasting",
  journal =      j-TOMM,
  volume =       "11",
  number =       "2s",
  pages =        "47:1--47:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700297",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 25 17:56:15 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "3DTI Amphitheater is a live broadcasting system for
                 dissemination of 3DTI (3D Tele-immersive) content. The
                 virtual environment constructed by the system mimics an
                 amphitheater in the real world, where performers
                 interact with each other in the central circular stage,
                 and the audience is placed in virtual seats that
                 surround the stage. Users of the Amphitheater can be
                 geographically dispersed and the streams created by the
                 performer sites are disseminated in a P2P network among
                 the participants. To deal with the high bandwidth
                 demand and strict latency bound of the service, we
                 identify the hierarchical priority of streams in
                 construction of the content dissemination forest.
                 Result shows that the Amphitheater outperforms prior
                 3DTI systems by boosting the application QoS by a
                 factor of 2.8 while sustaining the same hundred-scale
                 audience group.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2015:PMV,
  author =       "Ke Chen and Zhong Zhou and Wei Wu",
  title =        "Progressive Motion Vector Clustering for Motion
                 Estimation and Auxiliary Tracking",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "33:1--33:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700296",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The motion vector similarity between neighboring
                 blocks is widely used in motion estimation algorithms.
                 However, for nonneighboring blocks, they may also have
                 similar motions due to close depths or belonging to the
                 same object inside the scene. Therefore, the motion
                 vectors usually have several kinds of patterns, which
                 reveal a clustering structure. In this article, we
                 propose a progressive clustering algorithm, which
                 periodically counts the motion vectors of the past
                 blocks to make incremental clustering statistics. These
                 statistics are used as the motion vector predictors for
                 the following blocks. It is proved to be much more
                 efficient for one block to find the best-matching
                 candidate with the predictors. We also design the
                 clustering based search with CUDA for GPU acceleration.
                 Another interesting application of the clustering
                 statistics is persistent static object tracking. Based
                 on the statistics, several auxiliary tracking areas are
                 created to guide the object tracking. Even when the
                 target object has significant changes in appearance or
                 it disappears occasionally, its position still can be
                 predicted. The experiments on Xiph.org Video Test Media
                 dataset illustrate that our clustering based search
                 algorithm outperforms the mainstream and some
                 state-of-the-art motion estimation algorithms. It is 33
                 times faster on average than the full search algorithm
                 with only slightly higher mean-square error values in
                 the experiments. The tracking results show that the
                 auxiliary tracking areas help to locate the target
                 object effectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shen:2015:HFM,
  author =       "Liquan Shen and Ping An and Zhaoyang Zhang and
                 Qianqian Hu and Zhengchuan Chen",
  title =        "A {$3$D--HEVC} Fast Mode Decision Algorithm for
                 Real-Time Applications",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "34:1--34:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700298",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "3D High Efficiency Video Coding (3D-HEVC) is an
                 extension of the HEVC standard for coding of multiview
                 videos and depth maps. It inherits the same quadtree
                 coding structure as HEVC for both components, which
                 allows recursively splitting into four equal-sized
                 coding units (CU). One of 11 different prediction modes
                 is chosen to code a CU in inter-frames. Similar to the
                 joint model of H.264/AVC, the mode decision process in
                 HM (reference software of HEVC) is performed using all
                 the possible depth levels and prediction modes to find
                 the one with the least rate distortion cost using a
                 Lagrange multiplier. Furthermore, both motion
                 estimation and disparity estimation need to be
                 performed in the encoding process of 3D-HEVC. Those
                 tools achieve high coding efficiency, but lead to a
                 significant computational complexity. In this article,
                 we propose a fast mode decision algorithm for 3D-HEVC.
                 Since multiview videos and their associated depth maps
                 represent the same scene, at the same time instant,
                 their prediction modes are closely linked. Furthermore,
                 the prediction information of a CU at the depth level
                 $X$ is strongly related to that of its parent CU at the
                 depth level $ X - 1 $ in the quadtree coding structure
                 of HEVC since two corresponding CUs from two
                 neighboring depth levels share similar video
                 characteristics. The
                 proposed algorithm jointly exploits the inter-view
                 coding mode correlation, the inter-component
                 (texture-depth) correlation and the inter-level
                 correlation in the quadtree structure of 3D-HEVC.
                 Experimental results show that our algorithm saves 66\%
                 encoder runtime on average with only a 0.2\% BD-Rate
                 increase on coded views and 1.3\% BD-Rate increase on
                 synthesized views.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2015:BML,
  author =       "Xiaoshan Yang and Tianzhu Zhang and Changsheng Xu and
                 Ming-Hsuan Yang",
  title =        "Boosted Multifeature Learning for Cross-Domain
                 Transfer",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "35:1--35:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700286",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Conventional learning algorithm assumes that the
                 training data and test data share a common
                 distribution. However, this assumption will greatly
                 hinder the practical application of the learned model
                 for cross-domain data analysis in multimedia. To deal
                 with this issue, transfer learning based technology
                 should be adopted. As a typical version of transfer
                 learning, domain adaption has been extensively studied
                 recently due to its theoretical value and practical
                 interest. In this article, we propose a boosted
                 multifeature learning (BMFL) approach to iteratively
                 learn multiple representations within a boosting
                 procedure for unsupervised domain adaption. The
                 proposed BMFL method has a number of properties. (1) It
                 reuses all instances with different weights assigned by
                 the previous boosting iteration and avoids discarding
                 labeled instances as in conventional methods. (2) It
                 models the instance weight distribution effectively by
                 considering the classification error and the domain
                 similarity, which facilitates learning new feature
                 representation to correct the previously misclassified
                 instances. (3) It learns multiple different feature
                 representations to effectively bridge the source and
                 target domains. We evaluate the BMFL by comparing its
                 performance on three applications: image
                 classification, sentiment classification and spam
                 filtering. Extensive experimental results demonstrate
                 that the proposed BMFL algorithm performs favorably
                 against state-of-the-art domain adaption methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lin:2015:DVS,
  author =       "Pei-Yu Lin",
  title =        "Double Verification Secret Sharing Mechanism Based on
                 Adaptive Pixel Pair Matching",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "36:1--36:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700291",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Verifiability is essential for the secret sharing
                 approach, which allows the involved participants to
                 detect cheaters during the secret retrieval process. In
                 this article, we propose a double verification secret
                 sharing (DVSS) mechanism that can not only prevent
                 fraudulent participants but also satisfy the
                 requirements of secret payload, camouflage, image
                 fidelity and lossless revealed secret. DVSS offers
                 double verification process to enhance the cheater
                 detectability; experimental results reveal that the
                 designed scheme can share larger secret capacity and
                 retain superior image quality than the related secret
                 sharing methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2015:INB,
  author =       "Shuang Wang and Shuqiang Jiang",
  title =        "{INSTRE}: a New Benchmark for Instance-Level Object
                 Retrieval and Recognition",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "37:1--37:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700292",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Over the last several decades, researches on visual
                 object retrieval and recognition have achieved fast and
                 remarkable success. However, while the category-level
                 tasks prevail in the community, the instance-level
                 tasks (especially recognition) have not yet received
                 adequate focuses. Applications such as content-based
                 search engine and robot vision systems have alerted the
                 awareness to bring instance-level tasks into a more
                 realistic and challenging scenario. Motivated by the
                 limited scope of existing instance-level datasets, in
                 this article we propose a new benchmark for
                 INSTance-level visual object REtrieval and REcognition
                 (INSTRE). Compared with existing datasets, INSTRE has
                 the following major properties: (1) balanced data
                 scale, (2) more diverse intraclass instance variations,
                 (3) cluttered and less contextual backgrounds, (4)
                 object localization annotation for each image, (5)
                 well-manipulated double-labelled images for measuring
                 multiple object (within one image) case. We will
                 quantify and visualize the merits of INSTRE data, and
                 extensively compare them against existing datasets.
                 Then on INSTRE, we comprehensively evaluate several
                 popular algorithms to large-scale object retrieval
                 problem with multiple evaluation metrics. Experimental
                 results show that all the methods suffer a performance
                 drop on INSTRE, proving that this field still remains a
                 challenging problem. Finally we integrate these
                 algorithms into a simple yet efficient scheme for
                 recognition and compare it with classification-based
                 methods. Importantly, we introduce the realistic
                 multiobjects recognition problem. All experiments are
                 conducted in both single object case and multiple
                 objects case.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lathey:2015:IEE,
  author =       "Ankita Lathey and Pradeep K. Atrey",
  title =        "Image Enhancement in Encrypted Domain over Cloud",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "38:1--38:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2656205",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Cloud-based multimedia systems are becoming
                 increasingly common. These systems offer not only
                 storage facility, but also high-end computing
                 infrastructure which can be used to process data for
                 various analysis tasks ranging from low-level data
                 quality enhancement to high-level activity and behavior
                 identification operations. However, cloud data centers,
                 being third party servers, are often prone to
                 information leakage, raising security and privacy
                 concerns. In this article, we present a Shamir's secret
                 sharing based method to enhance the quality of
                 encrypted image data over cloud. Using the proposed
                 method we show that several image enhancement
                 operations such as noise removal, antialiasing, edge
                 and contrast enhancement, and dehazing can be performed
                 in encrypted domain with near-zero loss in accuracy and
                 minimal computation and data overhead. Moreover, the
                 proposed method is proven to be information
                 theoretically secure.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yin:2015:CVC,
  author =       "Yifang Yin and Beomjoo Seo and Roger Zimmermann",
  title =        "Content vs. Context: Visual and Geographic Information
                 Use in Video Landmark Retrieval",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "39:1--39:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700287",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Due to the ubiquity of sensor-equipped smartphones, it
                 has become increasingly feasible for users to capture
                 videos together with associated geographic metadata,
                 for example the location and the orientation of the
                 camera. Such contextual information creates new
                 opportunities for the organization and retrieval of
                 geo-referenced videos. In this study we explore the
                 task of landmark retrieval through the analysis of two
                 types of state-of-the-art techniques, namely
                 media-content-based and geocontext-based retrievals.
                 For the content-based method, we choose the Spatial
                 Pyramid Matching (SPM) approach combined with two
                 advanced coding methods: Sparse Coding (SC) and
                 Locality-Constrained Linear Coding (LLC). For the
                 geo-based method, we present the Geo Landmark
                 Visibility Determination (GeoLVD) approach which
                 computes the visibility of a landmark based on
                 intersections of a camera's field-of-view (FOV) and the
                 landmark's geometric information available from
                 Geographic Information Systems (GIS) and services. We
                 first compare the retrieval results of the two methods,
                 and discuss the strengths and weaknesses of each
                 approach in terms of precision, recall and execution
                 time. Next we analyze the factors that affect the
                 effectiveness for the content-based and the geo-based
                 methods, respectively. Finally we propose a hybrid
                 retrieval method based on the integration of the visual
                 (content) and geographic (context) information, which
                 is shown to achieve significant improvements in our
                 experiments. We believe that the results and
                 observations in this work will enlighten the design of
                 future geo-referenced video retrieval systems, improve
                 our understanding of selecting the most appropriate
                 visual features for indexing and searching, and help in
                 selecting between the most suitable methods for
                 retrieval based on different conditions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2015:RCI,
  author =       "Hong-Ying Yang and Xiang-Yang Wang and Pan-Pan Niu and
                 Ai-Long Wang",
  title =        "Robust Color Image Watermarking Using Geometric
                 Invariant Quaternion Polar Harmonic Transform",
  journal =      j-TOMM,
  volume =       "11",
  number =       "3",
  pages =        "40:1--40:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700299",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Feb 5 17:03:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "It is a challenging work to design a robust color
                 image watermarking scheme against geometric
                 distortions. Moments and moment invariants have become
                 a powerful tool in robust image watermarking owing to
                 their image description capability and geometric
                 invariance property. However, the existing moment-based
                 watermarking schemes were mainly designed for gray
                 images but not for color images, and detection quality
                 and robustness will be lowered when watermark is
                 directly embedded into the luminance component or three
                 color channels of color images. Furthermore, the
                 imperceptibility of the embedded watermark is not well
                 guaranteed. Based on algebra of quaternions and polar
                 harmonic transform (PHT), we introduced the quaternion
                 polar harmonic transform (QPHT) for invariant color
                 image watermarking in this article, which can be seen
                 as the generalization of PHT for gray-level images. It
                 is shown that the QPHT can be obtained from the PHT of
                 each color channel. We derived and analyzed the
                 rotation, scaling, and translation (RST) invariant
                 property of QPHT. We also discussed the problem of
                 color image watermarking using QPHT. Experimental
                 results are provided to illustrate the efficiency of
                 the proposed color image watermarking against geometric
                 distortions and common image processing operations
                 (including color attacks).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Krishnappa:2015:CCV,
  author =       "Dilip Kumar Krishnappa and Michael Zink and Carsten
                 Griwodz and P{\aa}l Halvorsen",
  title =        "Cache-Centric Video Recommendation: an Approach to
                 Improve the Efficiency of {YouTube} Caches",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "48:1--48:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2716310",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we take advantage of the user
                 behavior of requesting videos from the top of the
                 related list provided by YouTube to improve the
                 performance of YouTube caches. We recommend that local
                 caches reorder the related lists associated with
                 YouTube videos, presenting the cached content above
                 noncached content. We argue that the likelihood that
                 viewers select content from the top of the related list
                 is higher than selection from the bottom, and pushing
                 contents already in the cache to the top of the related
                 list would increase the likelihood of choosing cached
                 content. To verify that the position on the list really
                 is the selection criterion more dominant than the
                 content itself, we conduct a user study with 40
                 YouTube-using volunteers who were presented with random
                 related lists in their everyday YouTube use. After
                 confirming our assumption, we analyze the benefits of
                 our approach by an investigation that is based on two
                 traces collected from a university campus. Our analysis
                 shows that the proposed reordering approach for related
                 lists would lead to a 2 to 5 times increase in cache
                 hit rate compared to an approach without reordering the
                 related list. This increase in hit rate would lead to
                 reduction in server load and backend bandwidth usage,
                 which in turn reduces the latency in streaming the
                 video requested by the viewer and has the potential to
                 improve the overall performance of YouTube's content
                 distribution system. An analysis of YouTube's
                 recommendation system reveals that related lists are
                 created from a small pool of videos, which increases
                 the potential for caching content from related lists
                 and reordering based on the content in the cache.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2015:PMC,
  author =       "Yu Zhang and James Z. Wang and Jia Li",
  title =        "Parallel Massive Clustering of Discrete
                 Distributions",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "49:1--49:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700293",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The trend of analyzing big data in artificial
                 intelligence demands highly-scalable machine learning
                 algorithms, among which clustering is a fundamental and
                 arguably the most widely applied method. To extend the
                 applications of regular vector-based clustering
                 algorithms, the Discrete Distribution (D2) clustering
                 algorithm has been developed, aiming at clustering data
                 represented by bags of weighted vectors which are well
                 adopted data signatures in many emerging information
                 retrieval and multimedia learning applications.
                 However, the high computational complexity of
                 D2-clustering limits its impact in solving massive
                 learning problems. Here we present the parallel
                 D2-clustering (PD2-clustering) algorithm with
                 substantially improved scalability. We developed a
                 hierarchical multipass algorithm structure for parallel
                 computing in order to achieve a balance between the
                 individual-node computation and the integration process
                 of the algorithm. Experiments and extensive comparisons
                 between PD2-clustering and other clustering algorithms
                 are conducted on synthetic datasets. The results show
                 that the proposed parallel algorithm achieves
                 significant speed-up with minor accuracy loss. We apply
                 PD2-clustering to image concept learning. In addition,
                 by extending D2-clustering to symbolic data, we apply
                 PD2-clustering to protein sequence clustering. For both
                 applications, we demonstrate the high competitiveness
                 of our new algorithm in comparison with other
                 state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Baik:2015:EMR,
  author =       "Eilwoo Baik and Amit Pande and Prasant Mohapatra",
  title =        "Efficient {MAC} for Real-Time Video Streaming over
                 Wireless {LAN}",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "50:1--50:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2744412",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Wireless communication systems are highly prone to
                 channel errors. With video being a major player in
                 Internet traffic and undergoing exponential growth in
                 wireless domain, we argue for the need of a Video-aware
                 MAC (VMAC) to significantly improve the throughput and
                 delay performance of real-time video streaming service.
                 VMAC makes two changes to optimize wireless LAN for
                 video traffic: (a) It incorporates a
                 Perceptual-Error-Tolerance (PET) to the MAC frames by
                 reducing MAC retransmissions while minimizing any
                 impact on perceptual video quality; and (b) It uses a
                 group NACK-based Adaptive Window (NAW) of MAC frames to
                 improve both throughput and delay performance in
                 varying channel conditions. Through simulations and
                 experiments, we observe 56--89\% improvement in
                 throughput and 34--48\% improvement in delay
                 performance over legacy DCF and 802.11e schemes. VMAC
                 also shows 15--78\% improvement over legacy schemes
                 with multiple clients.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Antaris:2015:SSC,
  author =       "Stefanos Antaris and Dimitrios Rafailidis",
  title =        "Similarity Search over the Cloud Based on Image
                 Descriptors' Dimensions Value Cardinalities",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "51:1--51:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2716315",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In recognition that in modern applications billions of
                 images are stored into distributed databases in
                 different logical or physical locations, we propose a
                 similarity search strategy over the cloud based on the
                 dimensions value cardinalities of image descriptors.
                 Our strategy has low preprocessing requirements by
                 dividing the computational cost of the preprocessing
                 steps into several nodes over the cloud and locating
                 the descriptors with similar dimensions value
                 cardinalities logically close. New images are inserted
                 into the distributed databases over the cloud
                 efficiently, by supporting dynamical update in
                 real-time. The proposed insertion algorithm has low
                 computational complexity, depending exclusively on the
                 dimensionality of descriptors and a small subset of
                 descriptors with similar dimensions value
                 cardinalities. Finally, an efficient query processing
                 algorithm is proposed, where the dimensions of image
                 descriptors are prioritized in the searching strategy,
                 assuming that dimensions of high value cardinalities
                 have more discriminative power than the dimensions of
                 low ones. The computation effort of the query
                 processing algorithm is divided into several nodes over
                 the cloud infrastructure. In our experiments with seven
                 publicly available datasets of image descriptors, we
                 show that the proposed similarity search strategy
                 outperforms competitive methods of single node,
                 parallel and cloud-based architectures, in terms of
                 preprocessing cost, search time and accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lin:2015:AMD,
  author =       "Yin-Tzu Lin and I-Ting Liu and Jyh-Shing Roger Jang
                 and Ja-Ling Wu",
  title =        "Audio Musical Dice Game: a User-Preference-Aware
                 Medley Generating System",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "52:1--52:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710015",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article proposes a framework for creating
                 user-preference-aware music medleys from users' music
                 collections. We treat the medley generation process as
                 an audio version of a musical dice game. Once the
                 user's collection has been analyzed, the system is able
                 to generate various pleasing medleys. This flexibility
                 allows users to create medleys according to the
                 specified conditions, such as the medley structure or
                 the must-use clips. Even users without musical
                 knowledge can compose medley songs from their favorite
                 tracks. The effectiveness of the system has been
                 evaluated through both objective and subjective
                 experiments on individual components in the system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chen:2015:AVR,
  author =       "Bo-Hao Chen and Shih-Chia Huang",
  title =        "An Advanced Visibility Restoration Algorithm for
                 Single Hazy Images",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "53:1--53:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2726947",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Haze removal is the process by which horizontal
                 obscuration is eliminated from hazy images captured
                 during inclement weather. Images captured in natural
                 environments with varied weather conditions frequently
                 exhibit localized light sources or color-shift effects.
                 The occurrence of these effects presents a difficult
                 challenge for hazy image restoration, with which many
                 traditional restoration methods cannot adequately
                 contend. In this article, we present a new image haze
                 removal approach based on Fisher's linear
                 discriminant-based dual dark channel prior scheme in
                 order to solve the problems associated with the
                 presence of localized light sources and color shifts,
                 and thereby achieve effective restoration. Experimental
                 restoration results via qualitative and quantitative
                 evaluations show that our proposed approach can provide
                 higher haze-removal efficacy for images captured in
                 varied weather conditions than can the other
                 state-of-the-art approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bao:2015:CPE,
  author =       "Bing-Kun Bao and Changsheng Xu and Weiqing Min and
                 Mohammod Shamim Hossain",
  title =        "Cross-Platform Emerging Topic Detection and
                 Elaboration from Multimedia Streams",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "54:1--54:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2730889",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the explosive growth of online media platforms in
                 recent years, it becomes more and more attractive to
                 provide users a solution of emerging topic detection
                 and elaboration. And this posts a real challenge to
                 both industrial and academic researchers because of the
                 overwhelming information available in multiple
                 modalities and with large outlier noises. This article
                 provides a method on emerging topic detection and
                 elaboration using multimedia streams cross different
                 online platforms. Specifically, Twitter, New York Times
                 and Flickr are selected for the work to represent the
                 microblog, news portal and imaging sharing platforms.
                 The emerging keywords of Twitter are firstly extracted
                 using aging theory. Then, to overcome the nature of
                 short length message in microblog, Robust
                 Cross-Platform Multimedia Co-Clustering (RCPMM-CC) is
                 proposed to detect emerging topics with three
                 novelties: (1) The data from different media platforms
                 are in multimodalities; (2) The coclustering is
                 processed based on a pairwise correlated structure, in
                 which the involved three media platforms are pairwise
                 dependent; (3) The noninformative samples are
                 automatically pruned away at the same time of
                 coclustering. In the last step of cross-platform
                 elaboration, we enrich each emerging topic with the
                 samples from New York Times and Flickr by computing the
                 implicit links between social topics and samples from
                 selected news and Flickr image clusters, which are
                 obtained by RCPMM-CC. Qualitative and quantitative
                 evaluation results demonstrate the effectiveness of our
                 method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2015:QQG,
  author =       "Yang Li and Azzedine Boukerche",
  title =        "{QuGu}: a Quality Guaranteed Video Dissemination
                 Protocol Over Urban Vehicular Ad Hoc Networks",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "55:1--55:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2725469",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Video dissemination over Vehicular Ad Hoc Networks is
                 an attractive technology that supports many novel
                 applications. The merit of this work lies in the design
                 of an efficient video dissemination protocol that
                 provides high video quality at different data rates for
                 urban scenarios. Our objective is to improve received
                 video quality while meeting delay and packet loss. In
                 this work, we first employ a reliable scheme known as
                 connected dominating set, which is an efficient
                 receiver-based routing scheme for broadcasting video
                 content. To avoid repeated computing of the connected
                 dominating set, we add three statuses to each node. In
                 nonscalable video coding, the distribution of lost
                 frames can cause a major impact on video quality at the
                 receiver's end. Therefore, for the second step, we
                 employ Interleaving to spread out the burst losses and
                 to reduce the influence of loss distributions. Although
                 Interleaving can reduce the influence of cluster frame
                 loss, single packet loss is also a concern due to
                 collisions, and to intermittent disconnection in the
                 topology. In order to fix these single packet losses,
                 we propose a store-carry-forward scheme for the nodes
                 in order to retransmit the local buffer stored packets.
                 The results, when compared to the selected base
                 protocols, show that our proposed protocol is an
                 efficient solution for video dissemination over urban
                 Vehicular Ad Hoc Networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gaddam:2015:COM,
  author =       "Vamsidhar Reddy Gaddam and Ragnhild Eg and Ragnar
                 Langseth and Carsten Griwodz and P{\aa}l Halvorsen",
  title =        "The Cameraman Operating My Virtual Camera is
                 Artificial: Can the Machine Be as Good as a Human?",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "56:1--56:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2744411",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we argue that the energy spent in
                 designing autonomous camera control systems is not
                 spent in vain. We present a real-time virtual camera
                 system that can create smooth camera motion. Similar
                 systems are frequently benchmarked with the human
                 operator as the best possible reference; however, we
                 avoid a priori assumptions in our evaluations. Our main
                 question is simply whether we can design algorithms to
                 steer a virtual camera that can compete with the user
                 experience for recordings from an expert operator with
                 several years of experience? In this respect, we
                 present two low-complexity servoing methods that are
                 explored in two user studies. The results from the user
                 studies give a promising answer to the question
                 pursued. Furthermore, all components of the system meet
                 the real-time requirements on commodity hardware. The
                 growing capabilities of both hardware and network in
                 mobile devices give us hope that this system can be
                 deployed to mobile users in the near future. Moreover,
                 the design of the presented system takes into account
                 that services to concurrent users must be supported.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Natarajan:2015:MCC,
  author =       "Prabhu Natarajan and Pradeep K. Atrey and Mohan
                 Kankanhalli",
  title =        "Multi-Camera Coordination and Control in Surveillance
                 Systems: a Survey",
  journal =      j-TOMM,
  volume =       "11",
  number =       "4",
  pages =        "57:1--57:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710128",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 7 08:29:56 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The use of multiple heterogeneous cameras is becoming
                 more common in today's surveillance systems. In order
                 to perform surveillance tasks, effective coordination
                 and control in multi-camera systems is very important,
                 and is catching significant research attention these
                 days. This survey aims to provide researchers with a
                 state-of-the-art overview of various techniques for
                 multi-camera coordination and control (MC$^3$) that
                 have been adopted in surveillance systems. The existing
                 literature on MC$^3$ is presented through several
                 classifications based on the applicable architectures,
                 frameworks and the associated surveillance tasks.
                 Finally, a discussion on the open problems in
                 surveillance area that can be solved effectively using
                 MC$^3$ and the future directions in MC$^3$ research is
                 presented.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{You:2015:UPD,
  author =       "Shingchern D. You and Yi-Han Pu",
  title =        "Using Paired Distances of Signal Peaks in Stereo
                 Channels as Fingerprints for Copy Identification",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742059",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article proposes to use the relative distances
                 between adjacent envelope peaks detected in stereo
                 audio as fingerprints for copy identification. The
                 matching algorithm used is the rough longest common
                 subsequence (RLCS) algorithm. The experimental results
                 show that the proposed approach has better
                 identification accuracy than an MPEG-7 based scheme for
                 distorted and noisy audio. When compared with other
                 schemes, the proposed scheme uses fewer bits with
                 comparable performance. The proposed fingerprints can
                 also be used in conjunction with the MPEG-7 based
                 scheme for lower computational burden.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{ElEssaili:2015:QBC,
  author =       "Ali {El Essaili} and Zibin Wang and Eckehard Steinbach
                 and Liang Zhou",
  title =        "{QoE}-Based Cross-Layer Optimization for Uplink Video
                 Transmission",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2801124",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "We study the problem of resource-efficient uplink
                 distribution of user-generated video content over
                 fourth-generation mobile networks. This is challenged
                 by (1) the capacity-limited and time-variant uplink
                 channel, (2) the resource-hungry upstreamed videos and
                 their dynamically changing complexity, and (3) the
                 different playout times of the video consumers. To
                 address these issues, we propose a systematic approach
                 for quality-of-experience (QoE)-based resource
                 optimization and uplink transmission of multiuser
                 generated video content. More specifically, we present
                 an analytical model for distributed scalable video
                 transmission at the mobile producers which considers
                 these constraints. This is complemented by a multiuser
                 cross-layer optimizer in the mobile network which
                 determines the transmission capacity for each mobile
                 terminal under current cell load and radio conditions.
                 Both optimal and low-complexity solutions are
                 presented. Simulation results for LTE uplink
                 transmission show that significant gains in perceived
                 video quality can be achieved by our cross-layer
                 resource optimization scheme. In addition, the
                 distributed optimization at the mobile producers can
                 further improve the user experience across the
                 different types of video consumers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2015:CSN,
  author =       "Li-Jia Li and David A. Shamma and Xiangnan Kong and
                 Sina Jafarpour and Roelof {Van Zwol} and Xuanhui Wang",
  title =        "{CelebrityNet}: a Social Network Constructed from
                 Large-Scale Online Celebrity Images",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "3:1--3:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2801125",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Photos are an important information carrier for
                 implicit relationships. In this article, we introduce
                 an image based social network, called CelebrityNet,
                 built from implicit relationships encoded in a
                 collection of celebrity images. We analyze the social
                 properties reflected in this image-based social network
                 and automatically infer communities among the
                 celebrities. We demonstrate the interesting discoveries
                 of the CelebrityNet. We particularly compare the
                 inferred communities with human manually labeled ones
                 and show quantitatively that the automatically detected
                 communities are highly aligned with that of human
                 interpretation. Inspired by the uniqueness of visual
                 content and tag concepts within each community of the
                 CelebrityNet, we further demonstrate that the
                 constructed social network can serve as a knowledge
                 base for high-level visual recognition tasks. In
                 particular, this social network is capable of
                 significantly improving the performance of automatic
                 image annotation and classification of unknown
                 images.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2015:SDP,
  author =       "Bo Zhang and Nicola Conci and Francesco G. B. {De
                 Natale}",
  title =        "Segmentation of Discriminative Patches in Human
                 Activity Video",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2750780",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we present a novel approach to
                 segment discriminative patches in human activity
                 videos. First, we adopt the spatio-temporal interest
                 points (STIPs) to represent significant motion patterns
                 in the video sequence. Then, nonnegative sparse coding
                 is exploited to generate a sparse representation of
                 each STIP descriptor. We construct the feature vector
                 for each video by applying a two-stage sum-pooling and
                 $l_2$-normalization operation. After training a
                 multi-class classifier through the error-correcting
                 code SVM, the discriminative portion of each video is
                 determined as the patch that has the highest confidence
                 while also being correctly classified according to the
                 video category. Experimental results show that the
                 video patches extracted by our method are more
                 separable, while preserving the perceptually relevant
                 portion of each activity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2015:WMZ,
  author =       "Hui Wang and Mun Choon Chan and Wei Tsang Ooi",
  title =        "Wireless Multicast for Zoomable Video Streaming",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2801123",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Zoomable video streaming refers to a new class of
                 interactive video applications, where users can zoom
                 into a video stream to view a selected region of
                 interest in higher resolutions and pan around to move
                 the region of interest. The zoom and pan effects are
                 typically achieved by breaking the source video into a
                 grid of independently decodable tiles. Streaming the
                 tiles to a set of heterogeneous users using broadcast
                 is challenging, as users have different link rates and
                 different regions of interest at different resolution
                 levels. In this article, we consider the following
                 problem: Given the subset of tiles that each user
                 requested, the link rate of each user, and the
                 available time slots, at which resolution should each
                 tile be sent, to maximize the overall video quality
                 received by all users. We design an efficient algorithm
                 to solve this problem and evaluate the solution on a
                 testbed using 10 mobile devices. Our method is able to
                 achieve up to 12dB improvements over other heuristic
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bianco:2015:UPM,
  author =       "Simone Bianco and Gianluigi Ciocca",
  title =        "User Preferences Modeling and Learning for Pleasing
                 Photo Collage Generation",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2801126",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we consider how to automatically
                 create pleasing photo collages created by placing a set
                 of images on a limited canvas area. The task is
                 formulated as an optimization problem. Differently from
                 existing state-of-the-art approaches, we here exploit
                 subjective experiments to model and learn pleasantness
                 from user preferences. To this end, we design an
                 experimental framework for the identification of the
                 criteria that need to be taken into account to generate
                 a pleasing photo collage. Five different thematic photo
                 datasets are used to create collages using
                 state-of-the-art criteria. A first subjective
                 experiment where several subjects evaluated the
                 collages, emphasizes that different criteria are
                 involved in the subjective definition of pleasantness.
                 We then identify new global and local criteria and
                 design algorithms to quantify them. The relative
                 importance of these criteria are automatically learned
                 by exploiting the user preferences, and new collages
                 are generated. To validate our framework, we performed
                 several psycho-visual experiments involving different
                 users. The results shows that the proposed framework
                 allows to learn a novel computational model which
                 effectively encodes an inter-user definition of
                 pleasantness. The learned definition of pleasantness
                 generalizes well to new photo datasets of different
                 themes and sizes not used in the learning. Moreover,
                 compared with two state-of-the-art approaches, the
                 collages created using our framework are preferred by
                 the majority of the users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Fu:2015:QBS,
  author =       "Bo Fu and Dirk Staehle and Gerald Kunzmann and
                 Eckehard Steinbach and Wolfgang Kellerer",
  title =        "{QoE}-Based {SVC} Layer Dropping in {LTE} Networks
                 Using Content-Aware Layer Priorities",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2754167",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The increasing popularity of mobile video streaming
                 applications has led to a high volume of video traffic
                 in mobile networks. As the base station, for instance,
                 the eNB in LTE networks, has limited physical
                 resources, it can be overloaded by this traffic. This
                 problem can be addressed by using Scalable Video Coding
                 (SVC), which allows the eNB to drop layers of the video
                 streams to dynamically adapt the bitrate. The impact of
                 bitrate adaptation on the Quality of Experience (QoE)
                 for the users depends on the content characteristics of
                 videos. As the current mobile network architectures do
                 not support the eNB in obtaining video content
                 information, QoE optimization schemes with explicit
                 signaling of content information have been proposed.
                 These schemes, however, require the eNB or a specific
                 optimization module to process the video content on the
                 fly in order to extract the required information. This
                 increases the computation and signaling overhead
                 significantly, raising the OPEX for mobile operators.
                 To address this issue, in this article, a content-aware
                 (CA) priority marking and layer dropping scheme is
                 proposed. The CA priority indicates a transmission
                 order for the layers of all transmitted videos across
                 all users, resulting from a comparison of their utility
                 versus rate characteristics. The CA priority values can
                 be determined at the P-GW on the fly, allowing mobile
                 operators to control the priority marking process.
                 Alternatively, they can be determined offline at the
                 video servers, avoiding real-time computation in the
                 core network. The eNB can perform content-aware SVC
                 layer dropping using only the priority values. No
                 additional content processing is required. The proposed
                 scheme is lightweight both in terms of architecture and
                 computation. The improvement in QoE is substantial and
                 very close to the performance obtained with the
                 computation and signaling-intensive QoE optimization
                 schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shen:2015:ASM,
  author =       "Siqi Shen and Shun-Yun Hu and Alexandru Iosup and Dick
                 Epema",
  title =        "Area of Simulation: Mechanism and Architecture for
                 Multi-Avatar Virtual Environments",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2764463",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Aug 28 06:14:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Although Multi-Avatar Distributed Virtual Environments
                 (MAVEs) such as Real-Time Strategy (RTS) games
                 entertain daily hundreds of millions of online players,
                 their current designs do not scale. For example, even
                 popular RTS games such as the StarCraft series support
                 in a single game instance only up to 16 players and
                 only a few hundreds of avatars loosely controlled by
                 these players, which is a consequence of the
                 Event-Based Lockstep Simulation (EBLS) scalability
                 mechanism they employ. Through empirical analysis, we
                 show that a single Area of Interest (AoI), which is a
                 scalability mechanism that is sufficient for
                 single-avatar virtual environments (such as
                 Role-Playing Games), also cannot meet the scalability
                 demands of MAVEs. To enable scalable MAVEs, in this
                 work we propose Area of Simulation (AoS), a new
                 scalability mechanism, which combines and extends the
                 mechanisms of AoI and EBLS. Unlike traditional AoI
                 approaches, which employ only update-based operational
                 models, our AoS mechanism uses both event-based and
                 update-based operational models to manage not single,
                 but multiple areas of interest. Unlike EBLS, which is
                 traditionally used to synchronize the entire virtual
                 world, our AoS mechanism synchronizes only selected
                 areas of the virtual world. We further design an
                 AoS-based architecture, which is able to use both our
                 AoS and traditional AoI mechanisms simultaneously,
                 dynamically trading-off consistency guarantees for
                 scalability. We implement and deploy this architecture
                 and we demonstrate that it can operate with an order of
                 magnitude more avatars and a larger virtual world
                 without exceeding the resource capacity of players'
                 computers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Lee:2015:LAR,
  author          = {Lee, Suk Kyu and Yoo, Seungho and Jung, Jongtack and
                     Kim, Hwangnam and Ryoo, Jihoon},
  title           = {Link-Aware Reconfigurable Point-to-Point Video
                     Streaming for Mobile Devices},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1},
  pages           = {9:1--9:??},
  month           = aug,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2771438},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Aug 28 06:14:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Even though people of all social standings use current
                     mobile devices in the wide spectrum of purpose from
                     entertainment tools to communication means, some issues
                     with real-time video streaming in hostile wireless
                     environment still exist. In this article, we introduce
                     CoSA, a link-aware real-time video streaming system for
                     mobile devices. The proposed system utilizes a 3D
                     camera to distinguish the region of importance (ROI)
                     and non-ROI region within the video frame. Based on the
                     link-state feedback from the receiver, the proposed
                     system allocates a higher bandwidth for the region that
                     is classified as ROI and a lower bandwidth for non-ROI
                     in the video stream by reducing the video's bit rate.
                     We implemented CoSA in a real test-bed where the IEEE
                     802.11 is employed as a medium for wireless networking.
                     Furthermore, we verified the effectiveness of the
                     proposed system by conducting a thorough empirical
                     study. The results indicate that the proposed system
                     enables real-time video streaming while maintaining a
                     consistent visual quality by dynamically reconfiguring
                     video coding parameters according to the link
                     quality.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Wu:2015:CAM,
  author          = {Wu, Ming-Ju and Jang, Jyh-Shing R.},
  title           = {Combining Acoustic and Multilevel Visual Features for
                     Music Genre Classification},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1},
  pages           = {10:1--10:??},
  month           = aug,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2801127},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Aug 28 06:14:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Most music genre classification approaches extract
                     acoustic features from frames to capture timbre
                     information, leading to the common framework of
                     bag-of-frames analysis. However, time-frequency
                     analysis is also vital for modeling music genres. This
                     article proposes multilevel visual features for
                     extracting spectrogram textures and their temporal
                     variations. A confidence-based late fusion is proposed
                     for combining the acoustic and visual features. The
                     experimental results indicated that the proposed method
                     achieved an accuracy improvement of approximately 14\%
                     and 2\% in the world's largest benchmark dataset (MASD)
                     and Unique dataset, respectively. In particular, the
                     proposed approach won the Music Information Retrieval
                     Evaluation eXchange (MIREX) music genre classification
                     contests from 2011 to 2013, demonstrating the
                     feasibility and necessity of combining acoustic and
                     visual features for classifying music genres.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{She:2015:ISI,
  author          = {She, James and Chin, Alvin and Xia, Feng and
                     Crowcroft, Jon},
  title           = {Introduction to: Special Issue on {Smartphone}-Based
                     Interactive Technologies, Systems, and Applications},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {11:1--11:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2820398},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Zhu:2015:SSB,
  author          = {Zhu, Biao and Zhang, Hongxin and Chen, Wei and
                     Xia, Feng and Maciejewski, Ross},
  title           = {{ShotVis}: {Smartphone}-Based Visualization of {OCR}
                     Information from Images},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {12:1--12:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808210},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {While visualization has been widely used as a data
                     presentation tool in both desktop and mobile devices,
                     the rapid visualization of information from images is
                     still underexplored. In this work, we present a
                     smartphone image acquisition and visualization approach
                     for text-based data. Our prototype, ShotVis, takes
                     images of text captured from mobile devices and
                     extracts information for visualization. First,
                     scattered characters in the text are processed and
                     interactively reformulated to be stored as structured
                     data (i.e., tables of numbers, lists of words,
                     sentences). From there, ShotVis allows users to
                     interactively bind visual forms to the underlying data
                     and produce visualizations of the selected forms
                     through touch-based interactions. In this manner,
                     ShotVis can quickly summarize text from images into
                     word clouds, scatterplots, and various other
                     visualizations all through a simple click of the
                     camera. In this way, ShotVis facilitates the
                     interactive exploration of text data captured via
                     cameras in smartphone devices. To demonstrate our
                     prototype, several case studies are presented along
                     with one user study to demonstrate the effectiveness of
                     our approach.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Venkatagiri:2015:ALG,
  author          = {Venkatagiri, Seshadri Padmanabha and Chan, Mun Choon and
                     Ooi, Wei Tsang},
  title           = {Automated Link Generation for Sensor-Enriched
                     {Smartphone} Images},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {13:1--13:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808209},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The ubiquity of the smartphones makes them ideal
                     platforms for generating in-situ content. In
                     well-attended events, photos captured by attendees have
                     diverse views that could be subjected to occlusion and
                     abnormal lighting effects that could obscure the view.
                     Such unstructured photo collections also have
                     significant redundancy. Thus, a scene that is partially
                     occluded or has bad contrast in one photo may be
                     captured in another photo, possibly with higher
                     details. We propose an application called Autolink that
                     automatically establishes content-based links between
                     sensor-annotated photos in unstructured photo
                     collections captured using smartphones, such that users
                     could navigate between high-context and high-detail
                     images. This hierarchically structured image collection
                     facilitates the design of applications for navigation
                     and discovery, analytics about user photography
                     patterns, user taste, and content/event popularity.
                     Autolink includes a framework that constructs this
                     hierarchy efficiently and with little content-specific
                     training data by combining photo content processing
                     with associated sensor logs obtained from multiple
                     participants. We evaluated the performance of Autolink
                     on two real-world sensor tagged photo datasets. The
                     result shows that Autolink is able to efficiently
                     cluster photos at 20 times faster than candidate
                     algorithms, into the appropriate hierarchy with at
                     least 70\% precision and 37\% better recall than
                     candidate algorithms.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Chu:2015:VCS,
  author          = {Chu, Chung-Hua},
  title           = {Visual Comfort for Stereoscopic {$3$D} by Using Motion
                     Sensors on {$3$D} Mobile Devices},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {14:1--14:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808211},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Advanced 3D mobile devices attract a lot of attentions
                     for 3D visualization nowadays. Stereoscopic images and
                     video taken from the 3D mobile devices are
                     uncomfortable for 3D viewing experiences due to the
                     limited hardware for stereoscopic 3D stabilization. The
                     existing stereoscopic 3D stabilization methods are
                     computationally inefficient for the 3D mobile devices.
                     In this article, we point out that this critical issue
                     deteriorates the 3D viewing experiences on the 3D
                     mobile devices. To improve visual comfort, we propose
                     an efficient and effective algorithm to stabilize the
                     stereoscopic images and video for the 3D mobile
                     devices. To rectify the video jitter, we use the
                     gyroscope and accelerometer embedded on the mobile
                     devices to obtain the geometry information of the
                     cameras. Using a different method than
                     video-content-based motion estimation, our algorithm
                     based on the gyroscope and acceleration data can
                     achieve higher accuracy to effectively stabilize the
                     video. Therefore, our approach is robust in video
                     stabilization even under poor lighting and substantial
                     foreground motion. Our algorithm outperforms previous
                     approaches in not only smaller running time but also
                     the better comfort of the stereoscopic 3D visualization
                     for the 3D mobile devices.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Liu:2015:ECA,
  author          = {Liu, Kaikai and Li, Xiaolin},
  title           = {Enabling Context-Aware Indoor Augmented Reality via
                     {Smartphone} Sensing and Vision Tracking},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {15:1--15:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808208},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Augmented reality (AR) aims to render the world that
                     users see and overlay information that reflects the
                     real physical dynamics. The digital view could be
                     potentially projected near the Point-of-Interest (POI)
                     in a way that makes the virtual view attached to the
                     POI even when the camera moves. Achieving smooth
                     support for movements is a subject of extensive
                     studies. One of the key problems is where the augmented
                     information should be added to the field of vision in
                     real time. Existing solutions either leverage GPS
                     location for rendering outdoor AR views (hundreds of
                     kilometers away) or rely on image markers for
                     small-scale presentation (only for the marker region).
                     To realize AR applications under various scales and
                     dynamics, we propose a suite of algorithms for
                     fine-grained AR view tracking to improve the accuracy
                     of attitude and displacement estimation, reduce the
                     drift, eliminate the marker, and lower the computation
                     cost. Instead of requiring extremely high, accurate,
                     absolute locations, we propose multimodal solutions
                     according to mobility levels without additional
                     hardware requirement. Experimental results demonstrate
                     significantly less error in projecting and tracking the
                     AR view. These results are expected to make users
                     excited to explore their surroundings with enriched
                     content.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Ahn:2015:SHG,
  author          = {Ahn, Junho and Williamson, James and Gartrell, Mike and
                     Han, Richard and Lv, Qin and Mishra, Shivakant},
  title           = {Supporting Healthy Grocery Shopping via Mobile
                     Augmented Reality},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {16:1--16:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808207},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Augmented reality (AR) applications have recently
                     become popular on modern smartphones. We explore the
                     effectiveness of this mobile AR technology in the
                     context of grocery shopping, in particular as a means
                     to assist shoppers in making healthier decisions as
                     they decide which grocery products to buy. We construct
                     an AR-assisted mobile grocery-shopping application that
                     makes real-time, customized recommendations of healthy
                     products to users and also highlights products to avoid
                     for various types of health concerns, such as allergies
                     to milk or nut products, low-sodium or low-fat diets,
                     and general caloric intake. We have implemented a
                     prototype of this AR-assisted mobile grocery shopping
                     application and evaluated its effectiveness in grocery
                     store aisles. Our application's evaluation with typical
                     grocery shoppers demonstrates that AR overlay tagging
                     of products reduces the search time to find healthy
                     food items, and that coloring the tags helps to improve
                     the user's ability to quickly and easily identify
                     recommended products, as well as products to avoid. We
                     have evaluated our application's functionality by
                     analyzing the data we collected from 15 in-person
                     actual grocery-shopping subjects and 104 online
                     application survey participants.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Ma:2015:PUC,
  author          = {Ma, Sixuan and Yan, Zheng},
  title           = {{PSNController}: an Unwanted Content Control System in
                     Pervasive Social Networking Based on Trust Management},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {17:1--17:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808206},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Pervasive social networking (PSN) supports online and
                     instant social activities and communications in a
                     universal and pervasive manner on the basis of
                     heterogeneous networks. However, at the same time, when
                     mobile users expect useful and valuable contents via
                     PSN, they may also receive unwanted, unexpected, or
                     even malicious contents. These contents may intrude
                     user devices, occupy device memories, and irritate
                     mobile users. Unwanted content control in PSN has
                     become a crucial issue that impacts the success of PSN
                     usage. Nowadays, the literature still lacks a robust
                     and generic unwanted content control system that can be
                     practically applied. In this article, we present the
                     design and implementation of PSNController, an unwanted
                     content control system in PSN based on trust
                     management. We evaluate the system performance under a
                     variety of intrusions and attacks. The result shows the
                     system is effective with regard to accuracy,
                     efficiency, and robustness. It can control unwanted
                     contents in PSN according to trust evaluation. We
                     further study user acceptance on PSNController
                     prototype system based on a small-scale user study. We
                     receive sound user feedback on PSNController with
                     regard to perceived ease of use, perceived usefulness,
                     interface design, playfulness, and acceptance
                     attitude.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Hao:2015:LEP,
  author          = {Hao, Fei and Jiao, Mingjie and Min, Geyong and
                     Yang, Laurence T.},
  title           = {Launching an Efficient Participatory Sensing Campaign:
                     a Smart Mobile Device-Based Approach},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {18:1--18:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808198},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Participatory sensing is a promising sensing paradigm
                     that enables collection, processing, dissemination and
                     analysis of the phenomena of interest by ordinary
                     citizens through their handheld sensing devices.
                     Participatory sensing has huge potential in many
                     applications, such as smart transportation and air
                     quality monitoring. However, participants may submit
                     low-quality, misleading, inaccurate, or even malicious
                     data if a participatory sensing campaign is not
                     launched effectively. Therefore, it has become a
                     significant issue to establish an efficient
                     participatory sensing campaign for improving the data
                     quality. This article proposes a novel five-tier
                     framework of participatory sensing and addresses
                     several technical challenges in this proposed framework
                     including: (1) optimized deployment of data collection
                     points (DC-points); and (2) efficient recruitment
                     strategy of participants. Toward this end, the
                     deployment of DC-points is formulated as an
                     optimization problem with maximum utilization of sensor
                     and then a Wise-Dynamic DC-points Deployment (WD3)
                     algorithm is designed for high-quality sensing.
                     Furthermore, to guarantee the reliable sensing data
                     collection and communication, a trajectory-based
                     strategy for participant recruitment is proposed to
                     enable campaign organizers to identify well-suited
                     participants for data sensing based on a joint
                     consideration of temporal availability, trust, and
                     energy. Extensive experiments and performance analysis
                     of the proposed framework and associated algorithms are
                     conducted. The results demonstrate that the proposed
                     algorithm can achieve a good sensing coverage with a
                     smaller number of DC-points, and the participants that
                     are termed as social sensors are easily selected, to
                     evaluate the feasibility and extensibility of the
                     proposed recruitment strategies.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Rawat:2015:CAP,
  author          = {Rawat, Yogesh Singh and Kankanhalli, Mohan S.},
  title           = {Context-Aware Photography Learning for Smart Mobile
                     Devices},
  journal         = j-TOMM,
  volume          = {12},
  number          = {1s},
  pages           = {19:1--19:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2808199},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 21 16:37:02 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tomccap/;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this work we have developed a photography model
                     based on machine learning which can assist a user in
                     capturing high quality photographs. As scene
                     composition and camera parameters play a vital role in
                     aesthetics of a captured image, the proposed method
                     addresses the problem of learning photographic
                     composition and camera parameters. Further, we observe
                     that context is an important factor from a photography
                     perspective, we therefore augment the learning with
                     associated contextual information. The proposed method
                     utilizes publicly available photographs along with
                     social media cues and associated metainformation in
                     photography learning. We define context features based
                     on factors such as time, geolocation, environmental
                     conditions and type of image, which have an impact on
                     photography. We also propose the idea of computing the
                     photographic composition basis, eigenrules and
                     baserules, to support our composition learning. The
                     proposed system can be used to provide feedback to the
                     user regarding scene composition and camera parameters
                     while the scene is being captured. It can also
                     recommend position in the frame where people should
                     stand for better composition. Moreover, it also
                     provides camera motion guidance for pan, tilt and zoom
                     to the user for improving scene composition.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Canazza:2015:ATM,
  author =       "Sergio Canazza and Carlo Fantozzi and Niccol{\`o}
                 Pretto",
  title =        "Accessing Tape Music Documents on Mobile Devices",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808200",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The aim of this article is to present and discuss an
                 innovative methodology aimed at accessing digitized
                 copies of historical tape music audio documents; the
                 methodology leverages on the multimedia and
                 multisensory capabilities of mobile devices to provide
                 an unprecedented level of fruition. In addition to the
                 methodology, and stemming from it, we present an actual
                 software application for Android tablet devices. This
                 novel piece of software was designed and developed in a
                 multidisciplinary team involving engineers as well as
                 musicians, composers, and archivists. The strongest
                 element in our work is the fact that it follows a
                 rigorous process and it is based on the principles of
                 philological awareness; thus, it also takes into
                 consideration the critical points in the musicologist's
                 domain such as (i) the definition of preservation
                 (i.e., master) copy, (ii) the importance of secondary
                 information, (iii) the history of production and
                 transmission of audio documents.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hu:2015:SCC,
  author =       "Xiping Hu and Junqi Deng and Jidi Zhao and Wenyan Hu
                 and Edith C.-H. Ngai and Renfei Wang and Johnny Shen
                 and Min Liang and Xitong Li and Victor C. M. Leung and
                 Yu-Kwong Kwok",
  title =        "{SAfeDJ}: a Crowd-Cloud Codesign Approach to
                 Situation-Aware Music Delivery for Drivers",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "21:1--21:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808201",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Driving is an integral part of our everyday lives, but
                 it is also a time when people are uniquely vulnerable.
                 Previous research has demonstrated that not only does
                 listening to suitable music while driving not impair
                 driving performance, but it could lead to an improved
                 mood and a more relaxed body state, which could improve
                 driving performance and promote safe driving
                 significantly. In this article, we propose SAfeDJ, a
                 smartphone-based situation-aware music recommendation
                 system, which is designed to turn driving into a safe
                 and enjoyable experience. SAfeDJ aims at helping
                 drivers to diminish fatigue and negative emotion. Its
                 design is based on novel interactive methods, which
                 enable in-car smartphones to orchestrate multiple
                 sources of sensing data and the drivers' social
                 context, in collaboration with cloud computing to form
                 a seamless crowdsensing solution. This solution enables
                 different smartphones to collaboratively recommend
                 preferable music to drivers according to each driver's
                 specific situations in an automated and intelligent
                 manner. Practical experiments of SAfeDJ have proved its
                 effectiveness in music-mood analysis, and mood-fatigue
                 detections of drivers with reasonable computation and
                 communication overheads on smartphones. Also, our user
                 studies have demonstrated that SAfeDJ helps to decrease
                 fatigue degree and negative mood degree of drivers by
                 49.09\% and 36.35\%, respectively, compared to
                 traditional smartphone-based music player under similar
                 driving situations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Baldauf:2015:ISG,
  author =       "Matthias Baldauf and Peter Fr{\"o}hlich and Florence
                 Adegeye and Stefan Suette",
  title =        "Investigating On-Screen Gamepad Designs for
                 {Smartphone}-Controlled Video Games",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "22:1--22:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808202",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "On-screen gamepads are increasingly used as
                 controllers for video games on distant screens, yet
                 lack the typical tactile feedback known from hardware
                 controllers. We conducted a comparative lab study to
                 investigate four smartphone gamepads inspired by
                 traditional game controllers and mobile game controls
                 (directional buttons, directional pad, floating
                 joystick, tilt control). The study consisted of both
                 completing a formal control test as well as controlling
                 two popular video games of different genres (Pac-Man
                 and Super Mario Bros.). The results indicate that the
                 directional buttons require the most attention of the
                 user, however, work precisely for direction-restricted
                 navigational tasks. Directional pad and joystick showed
                 a similar performance, yet they encourage drifting and
                 unintended operations when the user is focused on the
                 remote screen. While currently unfamiliar to many
                 users, the floating joystick can reduce the glances at
                 the device. Tilt turned out to be not sufficiently
                 precise and quick for the investigated tasks. The
                 article concludes with derived design guidelines with
                 easily realizable measures for typical contexts such as
                 casual gaming at home or spontaneous gaming on public
                 displays.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bental:2015:SSL,
  author =       "Diana S. Bental and Eliza Papadopoulou and Nicholas K.
                 Taylor and M. Howard Williams and Fraser R. Blackmun
                 and Idris S. Ibrahim and Mei Yii Lim and Ioannis
                 Mimtsoudis and Stuart W. Whyte and Edel Jennings",
  title =        "Smartening Up the Student Learning Experience with
                 Ubiquitous Media",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "23:1--23:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808203",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article describes how an experimental platform
                 for social, mobile and ubiquitous computing has been
                 used in a wide-ranging longitudinal ``in the wild''
                 case study of the platform with a set of third-party
                 services. The article outlines some of the relevant
                 aspects of the platform, including built-in support for
                 community formation, for context sensitivity, automated
                 learning and adaptation to the user, and for management
                 of privacy and trust relationships. The platform
                 architecture is based on the notion of Cooperating
                 Smart Spaces (CSSs), where a CSS is a partition of the
                 platform corresponding to a single user and distributed
                 over the devices belonging to that user. Three of the
                 case study services were intended for use in a physical
                 environment specifically created to support ubiquitous
                 intelligence; they were highly interactive and used
                 shared screens, voice input and gestural interaction.
                 Another three ubiquitous services were available
                 throughout the university environment as mobile and
                 desktop services. The case study exploited this
                 architecture's ability to integrate multiple novel
                 applications and interface devices and to deliver them
                 flexibly in these different environments. The platform
                 proved to be stable and reliable and the study shows
                 that treating a provider of services and resources (the
                 University) as a CSS is instrumental in enabling the
                 platform to provide this range of services across
                 differing environments.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hung:2015:ISI,
  author =       "Hayley Hung and George Toderici",
  title =        "Introduction to: Special Issue on Extended Best Papers
                 from {ACM Multimedia 2014}",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "24:1--24:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2820400",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Kim:2015:ERD,
  author =       "Yelin Kim and Emily Mower Provost",
  title =        "Emotion Recognition During Speech Using Dynamics of
                 Multiple Regions of the Face",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "25:1--25:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808204",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The need for human-centered, affective multimedia
                 interfaces has motivated research in automatic emotion
                 recognition. In this article, we focus on facial
                 emotion recognition. Specifically, we target a domain
                 in which speakers produce emotional facial expressions
                 while speaking. The main challenge of this domain is
                 the presence of modulations due to both emotion and
                 speech. For example, an individual's mouth movement may
                 be similar when he smiles and when he pronounces the
                 phoneme /IY/, as in ``cheese''. The result of this
                 confusion is a decrease in performance of facial
                 emotion recognition systems. In our previous work, we
                 investigated the joint effects of emotion and speech on
                 facial movement. We found that it is critical to employ
                 proper temporal segmentation and to leverage knowledge
                 of spoken content to improve classification
                 performance. In the current work, we investigate the
                 temporal characteristics of specific regions of the
                 face, such as the forehead, eyebrow, cheek, and mouth.
                 We present methodology that uses the temporal patterns
                 of specific regions of the face in the context of a
                 facial emotion recognition system. We test our proposed
                 approaches on two emotion datasets, the IEMOCAP and
                 SAVEE datasets. Our results demonstrate that the
                 combination of emotion recognition systems based on
                 different facial regions improves overall accuracy
                 compared to systems that do not leverage different
                 characteristics of individual regions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Feng:2015:CAC,
  author =       "Fangxiang Feng and Xiaojie Wang and Ruifan Li and
                 Ibrar Ahmad",
  title =        "Correspondence Autoencoders for Cross-Modal
                 Retrieval",
  journal =      j-TOMM,
  volume =       "12",
  number =       "1s",
  pages =        "26:1--26:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2808205",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 21 16:37:02 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article considers the problem of cross-modal
                 retrieval, such as using a text query to search for
                 images and vice-versa. Based on different autoencoders,
                 several novel models are proposed here for solving this
                 problem. These models are constructed by correlating
                 hidden representations of a pair of autoencoders. A
                 novel optimal objective, which minimizes a linear
                 combination of the representation learning errors for
                 each modality and the correlation learning error
                 between hidden representations of two modalities, is
                 used to train the model as a whole. Minimizing the
                 correlation learning error forces the model to learn
                 hidden representations with only common information in
                 different modalities, while minimizing the
                 representation learning error makes hidden
                 representations good enough to reconstruct inputs of
                 each modality. To balance the two kind of errors
                 induced by representation learning and correlation
                 learning, we set a specific parameter in our models.
                 Furthermore, according to the modalities the models
                 attempt to reconstruct they are divided into two
                 groups. One group including three models is named
                 multimodal reconstruction correspondence autoencoder
                 since it reconstructs both modalities. The other group
                 including two models is named unimodal reconstruction
                 correspondence autoencoder since it reconstructs a
                 single modality. The proposed models are evaluated on
                 three publicly available datasets. And our experiments
                 demonstrate that our proposed correspondence
                 autoencoders perform significantly better than three
                 canonical correlation analysis based models and two
                 popular multimodal deep models on cross-modal retrieval
                 tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2016:SPS,
  author =       "Longyu Zhang and Haiwei Dong and Abdulmotaleb {El
                 Saddik}",
  title =        "From {$3$D} Sensing to Printing: a Survey",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "27:1--27:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818710",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Three-dimensional (3D) sensing and printing
                 technologies have reshaped our world in recent years.
                 In this article, a comprehensive overview of techniques
                 related to the pipeline from 3D sensing to printing is
                 provided. We compare the latest 3D sensors and 3D
                 printers and introduce several sensing, postprocessing,
                 and printing techniques available from both commercial
                 deployments and published research. In addition, we
                 demonstrate several devices, software, and experimental
                 results of our related projects to further elaborate
                 details of this process. A case study is conducted to
                 further illustrate the possible tradeoffs during the
                 process of this pipeline. Current progress, future
                 research trends, and potential risks of 3D technologies
                 are also discussed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Petrangeli:2016:QDR,
  author =       "Stefano Petrangeli and Jeroen Famaey and Maxim Claeys
                 and Steven Latr{\'e} and Filip {De Turck}",
  title =        "{QoE}-Driven Rate Adaptation Heuristic for Fair
                 Adaptive Video Streaming",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "28:1--28:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818361",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "HTTP Adaptive Streaming (HAS) is quickly becoming the
                 de facto standard for video streaming services. In HAS,
                 each video is temporally segmented and stored in
                 different quality levels. Rate adaptation heuristics,
                 deployed at the video player, allow the most
                 appropriate level to be dynamically requested, based on
                 the current network conditions. It has been shown that
                 today's heuristics underperform when multiple clients
                 consume video at the same time, due to fairness issues
                 among clients. Concretely, this means that different
                 clients negatively influence each other as they compete
                 for shared network resources. In this article, we
                 propose a novel rate adaptation algorithm called FINEAS
                 (Fair In-Network Enhanced Adaptive Streaming), capable
                 of increasing clients' Quality of Experience (QoE) and
                 achieving fairness in a multiclient setting. A key
                 element of this approach is an in-network system of
                 coordination proxies in charge of facilitating fair
                 resource sharing among clients. The strength of this
                 approach is threefold. First, fairness is achieved
                 without explicit communication among clients and thus
                 no significant overhead is introduced into the network.
                 Second, the system of coordination proxies is
                 transparent to the clients, that is, the clients do not
                 need to be aware of its presence. Third, the HAS
                 principle is maintained, as the in-network components
                 only provide the clients with new information and
                 suggestions, while the rate adaptation decision remains
                 the sole responsibility of the clients themselves. We
                 evaluate this novel approach through simulations, under
                 highly variable bandwidth conditions and in several
                 multiclient scenarios. We show how the proposed
                 approach can improve fairness up to 80\% compared to
                 state-of-the-art HAS heuristics in a scenario with
                 three networks, each containing 30 clients streaming
                 video at the same time.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Sun:2016:SOR,
  author =       "Shaoyan Sun and Wengang Zhou and Qi Tian and Houqiang
                 Li",
  title =        "Scalable Object Retrieval with Compact Image
                 Representation from Generic Object Regions",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "29:1--29:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818708",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In content-based visual object retrieval, image
                 representation is one of the fundamental issues in
                 improving retrieval performance. Existing works adopt
                 either local SIFT-like features or holistic features,
                 and may suffer sensitivity to noise or poor
                 discrimination power. In this article, we propose a
                 compact representation for scalable object retrieval
                 from few generic object regions. The regions are
                 identified with a general object detector and are
                 described with a fusion of learning-based features and
                 aggregated SIFT features. Further, we compress feature
                 representation in large-scale image retrieval
                 scenarios. We evaluate the performance of the proposed
                 method on two public ground-truth datasets, with
                 promising results. Experimental results on a
                 million-scale image database demonstrate superior
                 retrieval accuracy with efficiency gain in both
                 computation and memory usage.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ebrahim:2016:MIB,
  author =       "Mansoor Ebrahim and Wai Chong Chia",
  title =        "Multiview Image Block Compressive Sensing with Joint
                 Multiphase Decoding for Visual Sensor Network",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "30:1--30:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818712",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, a multiview image compression
                 framework, which involves the use of Block-based
                 Compressive Sensing (BCS) and Joint Multiphase Decoding
                 (JMD), is proposed for a Visual Sensor Network (VSN).
                 In the proposed framework, one of the sensor nodes is
                 configured to serve as the reference node, the others
                 as nonreference nodes. The images are encoded
                 independently using the BCS to produce two observed
                 measurements that are transmitted to the host
                 workstation. In this case, the nonreference nodes
                 always encoded the images (I$_{NR}$) at a lower
                 subrate when compared with the images from the
                 reference nodes (I$_R$). The idea is to improve the
                 reconstruction of I$_{NR}$ using I$_R$. After the two
                 observed measurements are received by the host
                 workstation, they are first decoded independently, then
                 image registration is applied to align I$_R$ onto the
                 same plane of I$_{NR}$. The aligned I$_R$ is then fused
                 with I$_{NR}$, using wavelets to produce the projected
                 image I$_P$. Subsequently, the difference between the
                 measurements of the I$_P$ and I$_{NR}$ is calculated.
                 The difference is then decoded and added to I$_P$ to
                 produce the final reconstructed I$_{NR}$. The
                 simulation results show that the proposed framework is
                 able to improve the quality of I$_{NR}$ on average by
                 2dB to 3dB at lower subrates when compared with other
                 Compressive Sensing (CS)--based multiview image
                 compression frameworks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Pang:2016:OQA,
  author =       "Lei Pang and Chong-Wah Ngo",
  title =        "Opinion Question Answering by Sentiment Clip
                 Localization",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "31:1--31:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818711",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article considers multimedia question answering
                 beyond factoid and how-to questions. We are interested
                 in searching videos for answering opinion-oriented
                 questions that are controversial and hotly debated.
                 Examples of questions include ``Should Edward Snowden
                 be pardoned?'' and ``Obamacare---unconstitutional or
                 not?''. These questions often invoke emotional
                 response, either positively or negatively, hence are
                 likely to be better answered by videos than texts, due
                 to the vivid display of emotional signals visible
                 through facial expression and speaking tone.
                 Nevertheless, a potential answer of duration 60s may be
                 embedded in a video of 10min, resulting in degraded
                 user experience compared to reading the answer in text
                 only. Furthermore, a text-based opinion question may be
                 short and vague, while the video answers could be
                 verbal, less structured grammatically, and noisy
                 because of errors in speech transcription. Direct
                 matching of words or syntactic analysis of sentence
                 structure, such as adopted by factoid and how-to
                 question-answering, is unlikely to find video answers.
                 The first problem, the answer localization, is
                 addressed by audiovisual analysis of the emotional
                 signals in videos for locating video segments likely
                 expressing opinions. The second problem, questions and
                 answers matching, is tackled by a deep architecture
                 that nonlinearly matches text words in questions and
                 speeches in videos. Experiments are conducted on eight
                 controversial topics based on questions crawled from
                 Yahoo! Answers and Internet videos from YouTube.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Papapanagiotou:2016:ICB,
  author =       "Vasileios Papapanagiotou and Christos Diou and
                 Anastasios Delopoulos",
  title =        "Improving Concept-Based Image Retrieval with Training
                 Weights Computed from Tags",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "32:1--32:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790230",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents a novel approach to training
                 classifiers for concept detection using tags and a
                 variant of Support Vector Machine that enables the
                 usage of training weights per sample. Combined with an
                 appropriate tag weighting mechanism, more relevant
                 samples play a more important role in the calibration
                 of the final concept-detector model. We propose a
                 complete, automated framework that (i) calculates
                 relevance scores for each image-concept pair based on
                 image tags, (ii) transforms the scores into relevance
                 probabilities and automatically annotates each image
                 according to this probability, (iii) transforms either
                 the relevance scores or the probabilities into
                 appropriate training weights and finally, (iv)
                 incorporates the training weights and the visual
                 features into a Fuzzy Support Vector Machine classifier
                 to build the concept-detector model. The framework can
                 be applied to online public collections, by gathering a
                 large pool of diverse images, and using the calculated
                 probability to select a training set and the associated
                 training weights. To evaluate our argument, we
                 experiment on two large annotated datasets. Experiments
                 highlight the retrieval effectiveness of the proposed
                 approach. Furthermore, experiments with various levels
                 of annotation error show that using weights derived
                 from tags significantly increases the robustness of the
                 resulting concept detectors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2016:AGV,
  author =       "Xuyong Yang and Tao Mei and Ying-Qing Xu and Yong Rui
                 and Shipeng Li",
  title =        "Automatic Generation of Visual-Textual Presentation
                 Layout",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "33:1--33:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818709",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Visual-textual presentation layout (e.g., digital
                 magazine cover, poster, Power Point slides, and any
                 other rich media), which combines beautiful image and
                 overlaid readable texts, can result in an eye candy
                 touch to attract users' attention. The designing of
                 visual-textual presentation layout is therefore
                 becoming ubiquitous in both commercially printed
                 publications and online digital magazines. However,
                 handcrafting aesthetically compelling layouts still
                 remains challenging for many small businesses and
                 amateur users. This article presents a system to
                 automatically generate visual-textual presentation
                 layouts by investigating a set of aesthetic design
                 principles, through which an average user can easily
                 create visually appealing layouts. The system is
                 attributed with a set of topic-dependent layout
                 templates and a computational framework integrating
                 high-level aesthetic principles (in a top-down manner)
                 and low-level image features (in a bottom-up manner).
                 The layout templates, designed with prior knowledge
                 from domain experts, define spatial layouts, semantic
                 colors, harmonic color models, and font emotion and
                 size constraints. We formulate the typography as an
                 energy optimization problem by minimizing the cost of
                 text intrusion, the utility of visual space, and the
                 mismatch of information importance in perception and
                 semantics, constrained by the automatically selected
                 template and further preserving color harmonization. We
                 demonstrate that our designs achieve the best reading
                 experience compared with the reimplementation of parts
                 of existing state-of-the-art designs through a series
                 of user studies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2016:MCR,
  author =       "Xuelong Li and Mulin Chen and Qi Wang",
  title =        "Measuring Collectiveness via Refined Topological
                 Similarity",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "34:1--34:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2854000",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Crowd system has motivated a surge of interests in
                 many areas of multimedia, as it contains plenty of
                 information about crowd scenes. In crowd systems,
                 individuals tend to exhibit collective behaviors, and
                 the motion of all those individuals is called
                 collective motion. As a comprehensive descriptor of
                 collective motion, collectiveness has been proposed to
                 reflect the degree of individuals moving as an
                 entirety. Nevertheless, existing works mostly have
                 limitations to correctly find the individuals of a
                 crowd system and precisely capture the various
                 relationships between individuals, both of which are
                 essential to measure collectiveness. In this article,
                 we propose a collectiveness-measuring method that is
                 capable of quantifying collectiveness accurately. Our
                 main contributions are threefold: (1) we compute
                 relatively accurate collectiveness by making the
                 tracked feature points represent the individuals more
                 precisely with a point selection strategy; (2) we
                 jointly investigate the spatial-temporal information of
                 individuals and utilize it to characterize the
                 topological relationship between individuals by
                 manifold learning; (3) we propose a stability
                 descriptor to deal with the irregular individuals,
                 which influence the calculation of collectiveness.
                 Intensive experiments on the simulated and real world
                 datasets demonstrate that the proposed method is able
                 to compute relatively accurate collectiveness and keep
                 high consistency with human perception.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Tyson:2016:MAM,
  author =       "Gareth Tyson and Yehia Elkhatib and Nishanth Sastry
                 and Steve Uhlig",
  title =        "Measurements and Analysis of a Major Adult Video
                 Portal",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "35:1--35:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2854003",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Today, the Internet is a large multimedia delivery
                 infrastructure, with websites such as YouTube appearing
                 at the top of most measurement studies. However, most
                 traffic studies have ignored an important domain: adult
                 multimedia distribution. Whereas, traditionally, such
                 services were provided primarily via bespoke websites,
                 recently these have converged towards what is known as
                 ``Porn 2.0''. These services allow users to upload,
                 view, rate, and comment on videos for free (much like
                 YouTube). Despite their scale, we still lack even a
                 basic understanding of their operation. This article
                 addresses this gap by performing a large-scale study of
                 one of the most popular Porn 2.0 websites: YouPorn. Our
                 measurements reveal a global delivery infrastructure
                 that we have repeatedly crawled to collect statistics
                 (on 183k videos). We use this data to characterise the
                 corpus, as well as to inspect popularity trends and how
                 they relate to other features, for example, categories
                 and ratings. To explore our discoveries further, we use
                 a small-scale user study, highlighting key system
                 implications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Thomee:2016:FSP,
  author =       "Bart Thomee and Ioannis Arapakis and David A. Shamma",
  title =        "Finding Social Points of Interest from Georeferenced
                 and Oriented Online Photographs",
  journal =      j-TOMM,
  volume =       "12",
  number =       "2",
  pages =        "36:1--36:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2854004",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 3 17:36:33 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Points of interest are an important requirement for
                 location-based services, yet they are editorially
                 curated and maintained, either professionally or
                 through community. Beyond the laborious manual
                 annotation task, further complications arise as points
                 of interest may appear, relocate, or disappear over
                 time, and may be relevant only to specific communities.
                 To assist, complement, or even replace manual
                 annotation, we propose a novel method for the automatic
                 localization of points of interest depicted in photos
                 taken by people across the world. Our technique
                 exploits the geographic coordinates and the compass
                 direction supplied by modern cameras, while accounting
                 for possible measurement errors due to the variability
                 in accuracy of the sensors that produced them. We
                 statistically demonstrate that our method significantly
                 outperforms techniques from the research literature on
                 the task of estimating the geographic coordinates and
                 geographic footprints of points of interest in various
                 cities, even when photos are involved in the estimation
                 process that do not show the point of interest at
                 all.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{delBimbo:2016:PEC,
  author =       "Alberto del Bimbo",
  title =        "From the Past {Editor-In-Chief}",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "37e:1--37e:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903774",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37e",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2016:SPR,
  author =       "Luming Zhang and Xuelong Li and Liqiang Nie and Yan
                 Yan and Roger Zimmermann",
  title =        "Semantic Photo Retargeting Under Noisy Image Labels",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "37:1--37:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2886775",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the popularity of mobile devices, photo
                 retargeting has become a useful technique that adapts a
                 high-resolution photo onto a low-resolution screen.
                 Conventional approaches are limited in two aspects. The
                 first factor is the de-emphasized role of semantic
                 content that is many times more important than
                 low-level features in photo aesthetics. Second is the
                 importance of image spatial modeling: toward a
                 semantically reasonable retargeted photo, the spatial
                 distribution of objects within an image should be
                 accurately learned. To solve these two problems, we
                 propose a new semantically aware photo retargeting that
                 shrinks a photo according to region semantics. The key
                 technique is a mechanism transferring semantics of
                 noisy image labels (inaccurate labels predicted by a
                 learner like an SVM) into different image regions. In
                 particular, we first project the local aesthetic
                 features (graphlets in this work) onto a semantic
                 space, wherein image labels are selectively encoded
                 according to their noise level. Then, a
                 category-sharing model is proposed to robustly discover
                 the semantics of each image region. The model is
                 motivated by the observation that the semantic
                 distribution of graphlets from images tagged by a
                 common label remains stable in the presence of noisy
                 labels. Thereafter, a spatial pyramid is constructed to
                 hierarchically encode the spatial layout of graphlet
                 semantics. Based on this, a probabilistic model is
                 proposed to enforce the spatial layout of a retargeted
                 photo to be maximally similar to those from the
                 training photos. Experimental results show that (1)
                 noisy image labels predicted by different learners can
                 improve the retargeting performance, according to both
                 qualitative and quantitative analysis, and (2) the
                 category-sharing model stays stable even when 32.36\%
                 of image labels are incorrectly predicted.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhou:2016:MDD,
  author =       "Liang Zhou",
  title =        "Mobile Device-to-Device Video Distribution: Theory and
                 Application",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "38:1--38:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2886776",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "As video traffic has dominated the data flow of
                 smartphones, traditional cellular communications face
                 substantial transmission challenges. In this work, we
                 study mobile device-to-device (D2D) video distribution
                 that leverages the storage and communication capacities
                 of smartphones. In such a mobile distributed framework,
                 D2D communication represents an opportunistic process
                 to selectively store and transmit local videos to meet
                 the future demand of others. The performance is
                 measured by the service time, which denotes the elapsed
                 period for fulfilling the demand, and the corresponding
                 implementation of each device depends on the video's
                 demand, availability, and size. The main contributions
                 of this work lie in (1) considering the impact of video
                 size in a practical mobile D2D video distribution
                 scenario and proposing a general global estimation of
                 the video distribution based on limited and local
                 observations; (2) designing a purely distributed D2D
                 video distribution scheme without the monitoring of any
                 central controller; and (3) providing a practical
                 implementation of the scheme, which does not need to
                 know the video availability, user demand, and device
                 mobility. Numerical results have demonstrated the
                 efficiency and robustness of the proposed scheme.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ravi:2016:FAL,
  author =       "Hareesh Ravi and A. V. Subramanyam and Sabu Emmanuel",
  title =        "Forensic Analysis of Linear and Nonlinear Image
                 Filtering Using Quantization Noise",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "39:1--39:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857069",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The availability of intelligent image editing
                 techniques and antiforensic algorithms, make it
                 convenient to manipulate an image and to hide the
                 artifacts that it might have produced in the process.
                 Real world forgeries are generally followed by the
                 application of enhancement techniques such as filtering
                 and/or conversion of the image format to suppress the
                 forgery artifacts. Though several techniques evolved in
                 the direction of detecting some of these manipulations,
                 additional operations like recompression, nonlinear
                 filtering, and other antiforensic methods during
                 forgery are not deeply investigated. Toward this, we
                 propose a robust method to detect whether a given image
                 has undergone filtering (linear or nonlinear) based
                 enhancement, possibly followed by format conversion
                 after forgery. In the proposed method, JPEG
                 quantization noise is obtained using natural image
                 prior and quantization noise models. Transition
                 probability features extracted from the quantization
                 noise are used for machine learning based detection and
                 classification. We test the effectiveness of the
                 algorithm in classifying the class of the filter
                 applied and the efficacy in detecting filtering in low
                 resolution images. Experiments are performed to compare
                 the performance of the proposed technique with
                 state-of-the-art forensic filtering detection
                 algorithms. It is found that the proposed technique is
                 superior in most of the cases. Also, experiments
                 against popular antiforensic algorithms show the
                 counter antiforensic robustness of the proposed
                 technique.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hu:2016:SND,
  author =       "Xianjun Hu and Weiming Zhang and Ke Li and Honggang Hu
                 and Nenghai Yu",
  title =        "Secure Nonlocal Denoising in Outsourced Images",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "40:1--40:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2886777",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Signal processing in the encrypted domain becomes a
                 desired technique to protect privacy of outsourced data
                 in cloud. In this article, we propose a double-cipher
                 scheme to implement nonlocal means (NLM) denoising in
                 encrypted images. In this scheme, one ciphertext is
                 generated by the Paillier scheme, which enables the
                 mean filter, and the other is obtained by a
                 privacy-preserving transform, which enables the
                 nonlocal search. By the privacy-preserving transform,
                 the cloud server can search the similar pixel blocks in
                 the ciphertexts with the same speed as in the
                 plaintexts; thus, the proposed method can be executed
                 fast. To enhance the security, we randomly permutate
                 both ciphertexts. To reduce the denoising complexity
                 caused by random permutation, a random NLM method is
                 exploited in the encrypted domain. The experimental
                 results show that the quality of denoised images in the
                 encrypted domain is comparable to that obtained in the
                 plain domain.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Calagari:2016:DPS,
  author =       "Kiana Calagari and Tarek Elgamal and Khaled Diab and
                 Krzysztof Templin and Piotr Didyk and Wojciech Matusik
                 and Mohamed Hefeeda",
  title =        "Depth Personalization and Streaming of Stereoscopic
                 Sports Videos",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "41:1--41:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890103",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Current three-dimensional displays cannot fully
                 reproduce all depth cues used by a human observer in
                 the real world. Instead, they create only an illusion
                 of looking at a three-dimensional scene. This leads to
                 a number of challenges during the content creation
                 process. To assure correct depth reproduction and
                 visual comfort, either the acquisition setup has to be
                 carefully controlled or additional postprocessing
                 techniques have to be applied. Furthermore, these
                 manipulations need to account for a particular setup
                 that is used to present the content, for example,
                 viewing distance or screen size. This creates
                 additional challenges in the context of personal use
                 when stereoscopic content is shown on TV sets, desktop
                 monitors, or mobile devices. We address this problem by
                 presenting a new system for streaming stereoscopic
                 content. Its key feature is a computationally efficient
                 depth adjustment technique which can automatically
                 optimize viewing experience for videos of field sports
                 such as soccer, football, and tennis. Additionally, the
                 method enables depth personalization to allow users to
                 adjust the amount of depth according to their
                 preferences. Our stereoscopic video streaming system
                 was implemented, deployed, and tested with real
                 users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2016:ERM,
  author =       "Qiong Wu and Pierre Boulanger",
  title =        "Enhanced Reweighted {MRFs} for Efficient Fashion Image
                 Parsing",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "42:1--42:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890104",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Previous image parsing methods usually model the
                 problem in a conditional random field which describes a
                 statistical model learned from a training dataset and
                 then processes a query image using the conditional
                 probability. However, for clothing images, fashion
                 items have a large variety of layering and
                 configuration, and it is hard to learn a certain
                 statistical model of features that apply to general
                 cases. In this article, we take fashion images as an
                 example to show how Markov Random Fields (MRFs) can
                 outperform Conditional Random Fields when the
                 application does not follow a certain statistical model
                 learned from the training data set. We propose a new
                 method for automatically parsing fashion images in high
                 processing efficiency with significantly less training
                 time by applying a modification of MRFs, named
                 reweighted MRF (RW-MRF), which resolves the problem of
                 over smoothing infrequent labels. We further enhance
                 RW-MRF with occlusion prior and background prior to
                 resolve two other common problems in clothing parsing,
                 occlusion, and background spill. Our experimental
                 results indicate that our proposed clothing parsing
                 method significantly improves processing time and
                 training time over state-of-the-art methods, while
                 ensuring comparable parsing accuracy and improving
                 label recall rate.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hu:2016:ADA,
  author =       "Yao Hu and Chen Zhao and Deng Cai and Xiaofei He and
                 Xuelong Li",
  title =        "Atom Decomposition with Adaptive Basis Selection
                 Strategy for Matrix Completion",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "43:1--43:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903716",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Estimating missing entries in matrices has attracted
                 much attention due to its wide range of applications
                 like image inpainting and video denoising, which are
                 usually considered as low-rank matrix completion
                 problems theoretically. It is common to consider
                 nuclear norm as a surrogate of the rank operator since
                 it is the tightest convex lower bound of the rank
                 operator under certain conditions. However, most
                 approaches based on nuclear norm minimization involve a
                 number of singular value decomposition (SVD)
                 operations. Given a matrix $ X \in R^{m \times n} $,
                 the time complexity of the SVD operation is $ O(m n^2)
                 $, which brings prohibitive computational burden on
                 large-scale matrices, limiting the further usage of
                 these methods in real applications. Motivated by this
                 observation, a series of atom-decomposition-based
                 matrix completion methods have been studied. The key to
                 these methods is to reconstruct the target matrix by
                 pursuit methods in a greedy way, which only involves
                 the computation of the top SVD and has great advantages
                 in efficiency compared with the SVD-based matrix
                 completion methods. However, due to gradually serious
                 accumulation errors, atom-decomposition-based methods
                 usually result in unsatisfactory reconstruction
                 accuracy. In this article, we propose a new efficient
                 and scalable atom decomposition algorithm for matrix
                 completion called Adaptive Basis Selection Strategy
                 (ABSS). Different from traditional greedy atom
                 decomposition methods, a two-phase strategy is
                 conducted to generate the basis separately via
                 different strategies according to their different
                 nature. At first, we globally prune the basis space to
                 eliminate the unimportant basis as much as possible and
                 locate the probable subspace containing the most
                 informative basis. Then, another group of basis spaces
                 are learned to improve the recovery accuracy based on
                 local information. In this way, our proposed algorithm
                 breaks through the accuracy bottleneck of traditional
                 atom-decomposition-based matrix completion methods;
                 meanwhile, it reserves the innate efficiency advantages
                 over SVD-based matrix completion methods. We
                 empirically evaluate the proposed algorithm ABSS on
                 real visual image data and large-scale recommendation
                 datasets. Results have shown that ABSS has much better
                 reconstruction accuracy with comparable cost to
                 atom-decomposition-based methods. At the same time, it
                 outperforms the state-of-the-art SVD-based matrix
                 completion algorithms by similar or better
                 reconstruction accuracy with enormous advantages on
                 efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 3 (June 2016), article 44: layered screen
%%% coding/rendering for remote screen sharing.  The pages field ends in
%%% `??' (final page presumably unknown when the entry was created); the
%%% article number is repeated in articleno.
@Article{Miao:2016:HFL,
  author =       "Dan Miao and Jingjing Fu and Yan Lu and Shipeng Li and
                 Chang Wen Chen",
  title =        "A High-Fidelity and Low-Interaction-Delay Screen
                 Sharing System",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "44:1--44:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2897395",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The pervasive computing environment and wide network
                 bandwidth provide users more opportunities to share
                 screen content among multiple devices. In this article,
                 we introduce a remote display system to enable screen
                 sharing among multiple devices with high fidelity and
                 responsive interaction. In the developed system, the
                 frame-level screen content is compressed and
                 transmitted to the client side for screen sharing, and
                 the instant control inputs are simultaneously
                 transmitted to the server side for interaction. Even if
                 the screen responds immediately to the control messages
                 and updates at a high frame rate on the server side, it
                 is difficult to update the screen content with low
                 delay and high frame rate in the client side due to
                 non-negligible time consumption on the whole screen
                 frame compression, transmission, and display buffer
                 updating. To address this critical problem, we propose
                 a layered structure for screen coding and rendering to
                 deliver diverse screen content to the client side with
                 an adaptive frame rate. More specifically, the
                 interaction content with small region screen update is
                 compressed by a blockwise screen codec and rendered at
                 a high frame rate to achieve smooth interaction, while
                 the natural video screen content is compressed by
                 standard video codec and rendered at a regular frame
                 rate for a smooth video display. Experimental results
                 with real applications demonstrate that the proposed
                 system can successfully reduce transmission bandwidth
                 cost and interaction delay during screen sharing.
                 Especially for user interaction in small regions, the
                 proposed system can achieve a higher frame rate than
                 most previous counterparts.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 3 (June 2016), article 45: multi-user video
%%% annotation built on weak (eventual) consistency.  The pages field
%%% ends in `??' (final page presumably unknown); articleno holds the
%%% article number.
@Article{Wilk:2016:CAV,
  author =       "Stefan Wilk and Stephan Kopf and Wolfgang Effelsberg",
  title =        "Collaborative Annotation of Videos Relying on Weak
                 Consistency",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "45:1--45:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2907983",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This work discusses a distributed interactive video
                 system that supports video annotation using
                 simultaneous hyperlinking by multiple users. The users
                 mark and annotate objects within the video with links
                 to other media such as text, images, websites, or other
                 videos. Annotations are visualized on the client user
                 interface as an overlay close to the objects. Our
                 system is intuitive to use; for example, it contains
                 automatic object-tracking functionality that correctly
                 positions the annotations, even when the form or
                 location of an object changes. Thus, our first
                 contribution discusses the adaptive object-tracking
                 algorithm used for this repositioning. It shows
                 improved precision and reliability in comparison to
                 nonadaptive algorithms. A second key issue is to keep
                 the system responsive when the number of concurrent
                 annotators increases. Thus, we rely on the concept of
                 eventual consistency between different network
                 entities. While this weak form of consistency allows
                 temporary inconsistencies, it ensures that a consistent
                 state can be reached. Thus, the second contribution is
                 the design and evaluation of our distributed
                 interactive video system, which relies on the weak
                 consistency paradigm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 3 (June 2016), article 46: distributed rate
%%% control for a combined P2P/DASH live-streaming architecture.  The
%%% acronym in the title is braced ({P2P}) so that styles which recase
%%% titles leave it untouched.
@Article{Merani:2016:ASP,
  author =       "Maria Luisa Merani and Laura Natali",
  title =        "Adaptive Streaming in {P2P} Live Video Systems: a
                 Distributed Rate Control Approach",
  journal =      j-TOMM,
  volume =       "12",
  number =       "3",
  pages =        "46:1--46:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2912123",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 16 09:38:16 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Dynamic Adaptive Streaming over HTTP (DASH) is a
                 recently proposed standard that offers different
                 versions of the same media content to adapt the
                 delivery process over the Internet to dynamic bandwidth
                 fluctuations and different user device capabilities.
                 The peer-to-peer (P2P) paradigm for video streaming
                 allows us to leverage the cooperation among peers,
                 guaranteeing the service of video requests with
                 increased scalability and reduced cost. We propose to
                 combine these two approaches in a P2P-DASH
                 architecture, exploiting the potentiality of both. The
                 new platform is made of several swarms and a different
                 DASH representation is streamed within each of them;
                 unlike client-server DASH architectures, where each
                 client autonomously selects which version to download
                 according to current network conditions and to its
                 device resources, we put forth a new rate control
                 strategy implemented at peer site to maintain a good
                 viewing quality to the local user and to simultaneously
                 guarantee the successful operation of the P2P swarms.
                 The effectiveness of the solution is demonstrated
                 through simulation and it indicates that the P2P-DASH
                 platform is able to provide its users with very good
                 performance, much more satisfying than in a
                 conventional P2P environment where DASH is not
                 employed. Through a comparison with a reference DASH
                 system modeled via the Integer Linear Programming (ILP)
                 approach, the new system is shown to outperform such
                 reference architecture. To further validate the
                 proposal, in terms of both robustness and scalability,
                 system behavior is investigated in the critical
                 condition of a flash crowd, showing that the strong
                 upsurge of new users can be successfully revealed and
                 gradually accommodated.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 47: measurement study
%%% of gamecast sharing sites (WoTreplays and Twitch.tv).  First entry of
%%% issue 4 in this part of the file; the pages field ends in `??'
%%% (final page presumably unknown).
@Article{Jia:2016:WGB,
  author =       "Adele Lu Jia and Siqi Shen and Dick H. J. Epema and
                 Alexandru Iosup",
  title =        "When Game Becomes Life: The Creators and Spectators of
                 Online Game Replays and Live Streaming",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "47:1--47:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957750",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online gaming franchises such as World of Tanks,
                 Defense of the Ancients, and StarCraft have attracted
                 hundreds of millions of users who, apart from playing
                 the game, also socialize with each other through gaming
                 and viewing gamecasts. As a form of User Generated
                 Content (UGC), gamecasts play an important role in user
                 entertainment and gamer education. They deserve the
                 attention of both industrial partners and the academic
                 communities, corresponding to the large amount of
                 revenue involved and the interesting research problems
                 associated with UGC sites and social networks. Although
                 previous work has put much effort into analyzing
                 general UGC sites such as YouTube, relatively little is
                 known about the gamecast sharing sites. In this work,
                 we provide the first comprehensive study of gamecast
                 sharing sites, including commercial streaming-based
                 sites such as Amazon's Twitch.tv and
                 community-maintained replay-based sites such as
                 WoTreplays. We collect and share a novel dataset on
                 WoTreplays that includes more than 380,000 game
                 replays, shared by more than 60,000 creators with more
                 than 1.9 million gamers. Together with an earlier
                 published dataset on Twitch.tv, we investigate basic
                 characteristics of gamecast sharing sites, and we
                 analyze the activities of their creators and
                 spectators. Among our results, we find that (i)
                 WoTreplays and Twitch.tv are both fast-consumed
                 repositories, with millions of gamecasts being
                 uploaded, viewed, and soon forgotten; (ii) both the
                 gamecasts and the creators exhibit highly skewed
                 popularity, with a significant heavy tail phenomenon;
                 and (iii) the upload and download preferences of
                 creators and spectators are different: while the
                 creators emphasize their individual skills, the
                 spectators appreciate team-wise tactics. Our findings
                 provide important knowledge for infrastructure and
                 service improvement, for example, in the design of
                 proper resource allocation mechanisms that consider
                 future gamecasting and in the tuning of incentive
                 policies that further help player retention.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 48: blind 3D image
%%% watermarking robust to view synthesis.  The title writes 3D as
%%% {$3$D}: the digit is set in math mode and the brace group keeps the
%%% capital D from being lowercased by styles that recase titles.
@Article{Rana:2016:DBV,
  author =       "Shuvendu Rana and Arijit Sur",
  title =        "Depth-Based View-Invariant Blind {$3$D} Image
                 Watermarking",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "48:1--48:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957751",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the huge advance in Internet technology as well
                 as the availability of low-cost 3D display devices, 3D
                 image transmission has become popular in recent times.
                 Since watermarking has become regarded as a potential
                 Digital Rights Management (DRM) tools in the past
                 decade, 3D image watermarking is an emerging research
                 topic. With the introduction of the Depth Image-Based
                 Rendering (DIBR) technique, 3D image watermarking is a
                 more challenging task, especially for synthetic view
                 generation. In this article, synthetic view generation
                 is regarded as a potential attack, and a blind
                 watermarking scheme is proposed that can resist it. In
                 the proposed scheme, the watermark is embedded into the
                 low-pass filtered dependent view region of 3D images.
                 Block Discrete Cosine Transformation (DCT) is used for
                 spatial-filtration of the dependent view region to find
                 the DC coefficient with horizontally shifted coherent
                 regions from the left and right view to make the scheme
                 robust against synthesis view attack. A comprehensive
                 set of experiments have been carried out to justify the
                 robustness of the proposed scheme over related existing
                 schemes with respect to Stereo JPEG compression and
                 different noise addition attacks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 49: MobiCoop, an
%%% incentive/reputation-based cooperation scheme for mobile apps.
%%% Five authors, each separated by ` and '.  The surname with suffix is
%%% kept in one brace group so that the comma inside it is not taken as
%%% a name-part separator by BibTeX.
@Article{Silva:2016:MIB,
  author =       "Bruno M. C. Silva and Joel J. P. C. Rodrigues and
                 Neeraj Kumar and Mario L. {Proen{\c{c}}a, Jr.} and
                 Guangjie Han",
  title =        "{MobiCoop}: an Incentive-Based Cooperation Solution
                 for Mobile Applications",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "49:1--49:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957752",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Network architectures based on mobile devices and
                 wireless communications present several constraints
                 (e.g., processor, energy storage, bandwidth, etc.) that
                 affect the overall network performance. Cooperation
                 strategies have been considered as a solution to
                 address these network limitations. In the presence of
                 unstable network infrastructures, mobile nodes
                 cooperate with each other, forwarding data and
                 performing other specific network functionalities. This
                 article proposes a generalized incentive-based
                 cooperation solution for mobile services and
                 applications called MobiCoop. This reputation-based
                 scheme includes an application framework for mobile
                 applications that uses a Web service to handle all the
                 nodes reputation and network permissions. The main goal
                 of MobiCoop is to provide Internet services to mobile
                 devices without network connectivity through
                 cooperation with neighbor devices. The article includes
                 a performance evaluation study of MobiCoop considering
                 both a real scenario (using a prototype) and a
                 simulation-based study. Results show that the proposed
                 approach provides network connectivity independency to
                 users with mobile apps when Internet connectivity is
                 unavailable. Then, it is concluded that MobiCoop
                 improved significantly the overall system performance
                 and the service provided for a given mobile
                 application.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 50: progressive
%%% visual cryptography with unexpanded, meaningful shares.  Unlike its
%%% neighbours, bibsource also lists cryptography2010.bib, so this entry
%%% is presumably mirrored in that bibliography as well.
@Article{Shivani:2016:PVC,
  author =       "Shivendra Shivani and Suneeta Agarwal",
  title =        "Progressive Visual Cryptography with Unexpanded
                 Meaningful Shares",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "50:1--50:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2935618",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The traditional $k$-out-of-$n$ Visual Cryptography
                 (VC) scheme is the conception of ``all or nothing'' for
                 $n$ participants to share a secret image. The original
                 secret image can be visually revealed only when a
                 subset of $k$ or more shares are superimposed together,
                 but if the number of stacked shares are less than $k$,
                 nothing will be revealed. On the other hand, a
                 Progressive Visual Cryptography (PVC) scheme differs
                 from the traditional VC with respect to decoding. In
                 PVC, clarity and contrast of the decoded secret image
                 will be increased progressively with the number of
                 stacked shares. Much of the existing state-of-the-art
                 research on PVC has problems with pixel expansion and
                 random pattern of the shares. In this article, a novel
                 scheme of progressive visual cryptography with four or
                 more number of unexpanded as well as meaningful shares
                 has been proposed. For this, a novel and efficient
                 Candidate Block Replacement preprocessing approach and
                 a basis matrix creation algorithm have also been
                 introduced. The proposed method also eliminates many
                 unnecessary encryption constraints like a predefined
                 codebook for encoding and decoding the secret image,
                 restriction on the number of participants, and so on.
                 From the experiments, it is observed that the
                 reconstruction probability of black pixels in the
                 decoded image corresponding to the black pixel in the
                 secret image is always 1, whereas that of white pixels
                 is 0.5 irrespective of the meaningful contents visible
                 in the shares, thus ensuring the value of contrast to
                 always be 50\%. Therefore, a reconstructed image can be
                 easily identified by a human visual system without any
                 computation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 51: effect of audio
%%% on perceived olfactory/video synchronization skew.  The skew
%%% intervals in the abstract are written with escaped braces, since
%%% bare braces are TeX grouping and would not appear when the abstract
%%% is typeset; the escaped pairs are balanced, as BibTeX requires.
@Article{Ademoye:2016:AME,
  author =       "Oluwakemi A. Ademoye and Niall Murray and Gabriel-Miro
                 Muntean and Gheorghita Ghinea",
  title =        "Audio Masking Effect on Inter-Component Skews in
                 Olfaction-Enhanced Multimedia Presentations",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "51:1--51:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957753",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Media-rich content plays a vital role in consumer
                 applications today, as these applications try to find
                 new and interesting ways to engage their users. Video,
                 audio, and the more traditional forms of media content
                 continue to dominate with respect to the use of media
                 content to enhance the user experience. Tactile
                 interactivity has also now become widely popular in
                 modern computing applications, while our olfactory and
                 gustatory senses continue to have a limited role.
                 However, in recent times, there have been significant
                 advancements regarding the use of olfactory media
                 content (i.e., smell), and there are a variety of
                 devices now available to enable its computer-controlled
                 emission. This paper explores the impact of the audio
                 stream on user perception of olfactory-enhanced video
                 content in the presence of skews between the olfactory
                 and video media. This research uses the results from
                 two experimental studies of user-perceived quality of
                 olfactory-enhanced multimedia, where audio was present
                 and absent, respectively. Specifically, the paper shows
                 that the user Quality of Experience (QoE) is generally
                 higher in the absence of audio for nearly perfect
                 synchronized olfactory-enhanced multimedia
                 presentations (i.e., an olfactory media skew of between
                 \{-10,+10s\}); however, for greater olfactory media skews
                 (ranging between \{-30s;-10s\} and \{+10s, +30s\}) user QoE
                 is higher when the audio stream is present. It can be
                 concluded that the presence of the audio has the
                 ability to mask larger synchronization skews between
                 the other media components in olfaction-enhanced
                 multimedia presentations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 52: Field Effect
%%% Bilinear Deep Networks (FEBDN) for recognition with missing
%%% features.  The pages field ends in `??' (final page presumably
%%% unknown); articleno holds the article number.
@Article{Zhong:2016:FED,
  author =       "Sheng-Hua Zhong and Yan Liu and Kien A. Hua",
  title =        "Field Effect Deep Networks for Image Recognition with
                 Incomplete Data",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "52:1--52:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957754",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Image recognition with incomplete data is a well-known
                 hard problem in computer vision and machine learning.
                 This article proposes a novel deep learning technique
                 called Field Effect Bilinear Deep Networks (FEBDN) for
                 this problem. To address the difficulties of
                 recognizing incomplete data, we design a novel
                 second-order deep architecture with the Field Effect
                 Restricted Boltzmann Machine, which models the
                 reliability of the delivered information according to
                 the availability of the features. Based on this new
                 architecture, we propose a new three-stage learning
                 procedure with field effect bilinear initialization,
                 field effect abstraction and estimation, and global
                 fine-tuning with missing features adjustment. By
                 integrating the reliability of features into the new
                 learning procedure, the proposed FEBDN can jointly
                 determine the classification boundary and estimate the
                 missing features. FEBDN has demonstrated impressive
                 performance on recognition and estimation tasks in
                 various standard datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

%%% TOMM volume 12, number 4 (August 2016), article 53: cross-network
%%% (Twitter to YouTube) user modeling for video recommendation.  The
%%% last author is written in First-Last order across a line break
%%% (M. / Shamim Hossain); BibTeX treats the newline as ordinary space.
@Article{Yan:2016:UVR,
  author =       "Ming Yan and Jitao Sang and Changsheng Xu and M.
                 Shamim Hossain",
  title =        "A Unified Video Recommendation by Cross-Network User
                 Modeling",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "53:1--53:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2957755",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online video sharing sites are increasingly
                 encouraging their users to connect to the social
                 network venues such as Facebook and Twitter, with goals
                 to boost user interaction and better disseminate the
                 high-quality video content. This in turn provides huge
                 possibilities to conduct cross-network collaboration
                 for personalized video recommendation. However, very
                 few efforts have been devoted to leveraging users'
                 social media profiles in the auxiliary network to
                 capture and personalize their video preferences, so as
                 to recommend videos of interest. In this article, we
                 propose a unified YouTube video recommendation solution
                 by transferring and integrating users' rich social and
                 content information in Twitter network. While general
                 recommender systems often suffer from typical problems
                 like cold-start and data sparsity, our proposed
                 recommendation solution is able to effectively learn
                 from users' abundant auxiliary information on Twitter
                 for enhanced user modeling and well address the typical
                 problems in a unified framework. In this framework, two
                 stages are mainly involved: (1) auxiliary-network data
                 transfer, where user preferences are transferred from
                 an auxiliary network by learning cross-network
                 knowledge associations; and (2) cross-network data
                 integration, where transferred user preferences are
                 integrated with the observed behaviors on a target
                 network in an adaptive fashion. Experimental results
                 show that the proposed cross-network collaborative
                 solution achieves superior performance not only in
                 terms of accuracy, but also in improving the diversity
                 and novelty of the recommended videos.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Jiang:2016:CVI,
  author =       "Yijing Jiang and Shanyu Tang and Liping Zhang and
                 Muzhou Xiong and Yau Jim Yip",
  title =        "Covert Voice over {Internet} Protocol Communications
                 with Packet Loss Based on Fractal Interpolation",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "54:1--54:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2961053",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The last few years have witnessed an explosive growth
                 in the research of information hiding in multimedia
                 objects, but few studies have taken into account packet
                 loss in multimedia networks. As one of the most popular
                 real-time services in the Internet, Voice over Internet
                 Protocol (VoIP) contributes to a large part of network
                 traffic for its advantages of real time, high flow, and
                 low cost. So packet loss is inevitable in multimedia
                 networks and affects the performance of VoIP
                 communications. In this study, a fractal-based VoIP
                 steganographic approach was proposed to realize covert
                 VoIP communications in the presence of packet loss. In
                 the proposed scheme, secret data to be hidden were
                 divided into blocks after being encrypted with the
                 block cipher, and each block of the secret data was
                 then embedded into VoIP streaming packets. The VoIP
                 packets went through a packet-loss system based on
                 Gilbert model which simulates a real network situation.
                 And a prediction model based on fractal interpolation
                 was built to decide whether a VoIP packet was suitable
                 for data hiding. The experimental results indicated
                 that the speech quality degradation increased with the
                 escalating packet-loss level. The average variance of
                 speech quality metrics (PESQ score) between the
                 ``no-embedding'' speech samples and the
                 ``with-embedding'' stego-speech samples was about
                 0.717, and the variances narrowed with the increasing
                 packet-loss level. Both the average PESQ scores and the
                 SNR values of stego-speech samples and the
                 data-retrieving rates had almost the same varying
                 trends when the packet-loss level increased, indicating
                 that the success rate of the fractal prediction model
                 played an important role in the performance of covert
                 VoIP communications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2016:SFM,
  author =       "Xiaoshan Yang and Tianzhu Zhang and Changsheng Xu",
  title =        "Semantic Feature Mining for Video Event
                 Understanding",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "55:1--55:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2962719",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Content-based video understanding is extremely
                 difficult due to the semantic gap between low-level
                 vision signals and the various semantic concepts
                 (object, action, and scene) in videos. Though feature
                 extraction from videos has achieved significant
                 progress, most of the previous methods rely only on
                 low-level features, such as the appearance and motion
                 features. Recently, visual-feature extraction has been
                 improved significantly with machine-learning
                 algorithms, especially deep learning. However, there is
                 still not enough work focusing on extracting semantic
                 features from videos directly. The goal of this article
                 is to adopt unlabeled videos with the help of text
                 descriptions to learn an embedding function, which can
                 be used to extract more effective semantic features
                 from videos when only a few labeled samples are
                 available for video recognition. To achieve this goal,
                 we propose a novel embedding convolutional neural
                 network (ECNN). We evaluate our algorithm by comparing
                 its performance on three challenging benchmarks with
                 several popular state-of-the-art methods. Extensive
                 experimental results show that the proposed ECNN
                 consistently and significantly outperforms the existing
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Nilsson:2016:ASD,
  author =       "Tommy Nilsson and Carl Hogsden and Charith Perera and
                 Saeed Aghaee and David J. Scruton and Andreas Lund and
                 Alan F. Blackwell",
  title =        "Applying Seamful Design in Location-Based Mobile
                 Museum Applications",
  journal =      j-TOMM,
  volume =       "12",
  number =       "4",
  pages =        "56:1--56:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2962720",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 25 07:28:05 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The application of mobile computing is currently
                 altering patterns of our behavior to a greater degree
                 than perhaps any other invention. In combination with
                 the introduction of power-efficient wireless
                 communication technologies, such as Bluetooth Low
                 Energy (BLE), designers are today increasingly
                 empowered to shape the way we interact with our
                 physical surroundings and thus build entirely new
                 experiences. However, our evaluations of BLE and its
                 abilities to facilitate mobile location-based
                 experiences in public environments revealed a number of
                 potential problems. Most notably, the position and
                 orientation of the user in combination with various
                 environmental factors, such as crowds of people
                 traversing the space, were found to cause major
                 fluctuations of the received BLE signal strength. These
                 issues are rendering a seamless functioning of any
                 location-based application practically impossible.
                 Instead of achieving seamlessness by eliminating these
                 technical issues, we thus choose to advocate the use of
                 a seamful approach, that is, to reveal and exploit
                 these problems and turn them into a part of the actual
                 experience. In order to demonstrate the viability of
                 this approach, we designed, implemented, and evaluated
                 the Ghost Detector---an educational location-based
                 museum game for children. By presenting a qualitative
                 evaluation of this game and by motivating our design
                 decisions, this article provides insight into some of
                 the challenges and possible solutions connected to the
                 process of developing location-based BLE-enabled
                 experiences for public cultural spaces.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yan:2017:LCI,
  author =       "Zheng Yan",
  title =        "Learning from Collective Intelligence: Feature
                 Learning Using Social Images and Tags",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2978656",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Feature representation for visual content is the key
                 to the progress of many fundamental applications such
                 as annotation and cross-modal retrieval. Although
                 recent advances in deep feature learning offer a
                 promising route towards these tasks, they are limited
                 in application domains where high-quality and
                 large-scale training data are expensive to obtain. In
                 this article, we propose a novel deep feature learning
                 paradigm based on social collective intelligence, which
                 can be acquired from the inexhaustible social
                 multimedia content on the Web, in particular, largely
                 social images and tags. Differing from existing feature
                 learning approaches that rely on high-quality
                 image-label supervision, our weak supervision is
                 acquired by mining the visual-semantic embeddings from
                 noisy, sparse, and diverse social image collections.
                 The resultant image-word embedding space can be used to
                 (1) fine-tune deep visual models for low-level feature
                 extractions and (2) seek sparse representations as
                 high-level cross-modal features for both image and
                 text. We offer an easy-to-use implementation for the
                 proposed paradigm, which is fast and compatible with
                 any state-of-the-art deep architectures. Extensive
                 experiments on several benchmarks demonstrate that the
                 cross-modal features learned by our paradigm
                 significantly outperforms others in various
                 applications such as content-based retrieval,
                 classification, and image captioning.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Cheung:2017:PVT,
  author =       "Ming Cheung and James She and Alvin Junus and Lei
                 Cao",
  title =        "Prediction of Virality Timing Using Cascades in Social
                 Media",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2978771",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Predicting content going viral in social networks is
                 attractive for viral marketing, advertisement,
                 entertainment, and other applications, but it remains a
                 challenge in the big data era today. Previous works
                 mainly focus on predicting the possible popularity of
                 content rather than the timing of reaching such
                 popularity. This work proposes a novel yet practical
                 iterative algorithm to predict virality timing, in
                 which the correlation between the timing and growth of
                 content popularity is captured by using its own big
                 data naturally generated from users' sharing. Such data
                 is not only able to correlate the dynamics and
                 associated timings in social cascades of viral content
                 but also can be useful to self-correct the predicted
                 timing against the actual timing of the virality in
                 each iterative prediction. The proposed prediction
                 algorithm is verified by datasets from two popular
                 social networks---Twitter and Digg---as well as two
                 synthesized datasets with extreme network densities and
                 infection rates. With about 50\% of the required
                 content virality data available (i.e., halfway before
                 reaching its actual virality timing), the error of the
                 predicted timing is proven to be bounded within a 40\%
                 deviation from the actual timing. To the best of our
                 knowledge, this is the first work that predicts content
                 virality timing iteratively by capturing social
                 cascades dynamics.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chiu:2017:AAS,
  author =       "Chih-Yi Chiu and Yu-Cyuan Liou and Amorntip
                 Prayoonwong",
  title =        "Approximate Asymmetric Search for Binary Embedding
                 Codes",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990504",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we propose a method of approximate
                 asymmetric nearest-neighbor search for binary embedding
                 codes. The asymmetric distance takes advantage of less
                 information loss at the query side. However,
                 calculating asymmetric distances through exhaustive
                 search is prohibitive in a large-scale dataset. We
                 present a novel method, called multi-index voting, that
                 integrates the multi-index hashing technique with a
                 voting mechanism to select appropriate candidates and
                 calculate their asymmetric distances. We show that the
                 candidate selection scheme can be formulated as the
                 tail of the binomial distribution function. In
                 addition, a binary feature selection method based on
                 minimal quantization error is proposed to address the
                 memory insufficiency issue and improve the search
                 accuracy. Substantial experimental evaluations were
                 made to demonstrate that the proposed method can yield
                 an approximate accuracy to the exhaustive search method
                 while significantly accelerating the runtime. For
                 example, one result shows that in a dataset of one
                 billion 256-bit binary codes, examining only 0.5\% of
                 the dataset, can reach 95--99\% close accuracy to the
                 exhaustive search method and accelerate the search by
                 73--128 times. It also demonstrates an excellent
                 tradeoff between the search accuracy and time
                 efficiency compared to the state-of-the-art
                 nearest-neighbor search methods. Moreover, the proposed
                 feature selection method shows its effectiveness and
                 improves the accuracy up to 8.35\% compared with other
                 feature selection methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Miller:2017:QBL,
  author =       "Konstantin Miller and Abdel-Karim Al-Tamimi and Adam
                 Wolisz",
  title =        "{QoE}-Based Low-Delay Live Streaming Using Throughput
                 Predictions",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990505",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Recently, Hypertext Transfer Protocol (HTTP)-based
                 adaptive streaming has become the de facto standard for
                 video streaming over the Internet. It allows clients to
                 dynamically adapt media characteristics to the varying
                 network conditions to ensure a high quality of
                 experience (QoE)---that is, minimize playback
                 interruptions while maximizing video quality at a
                 reasonable level of quality changes. In the case of
                 live streaming, this task becomes particularly
                 challenging due to the latency constraints. The
                 challenge further increases if a client uses a wireless
                 access network, where the throughput is subject to
                 considerable fluctuations. Consequently, live streams
                 often exhibit latencies of up to 20 to 30 seconds. In
                 the present work, we introduce an adaptation algorithm
                 for HTTP-based live streaming called LOLYPOP (short for
                 low-latency prediction-based adaptation), which is
                 designed to operate with a transport latency of a few
                 seconds. To reach this goal, LOLYPOP leverages
                 Transmission Control Protocol throughput predictions on
                 multiple time scales, from 1 to 10 seconds, along with
                 estimations of the relative prediction error
                 distributions. In addition to satisfying the latency
                 constraint, the algorithm heuristically maximizes the
                 QoE by maximizing the average video quality as a
                 function of the number of skipped segments and quality
                 transitions. To select an efficient prediction method,
                 we studied the performance of several time series
                 prediction methods in IEEE 802.11 wireless access
                 networks. We evaluated LOLYPOP under a large set of
                 experimental conditions, limiting the transport latency
                 to 3 seconds, against a state-of-the-art adaptation
                 algorithm called FESTIVE. We observed that the average
                 selected video representation index is by up to a
                 factor of 3 higher than with the baseline approach. We
                 also observed that LOLYPOP is able to reach points from
                 a broader region in the QoE space, and thus it is
                 better adjustable to the user profile or service
                 provider requirements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ranasinghe:2017:DLS,
  author =       "Nimesha Ranasinghe and Ellen Yi-Luen Do",
  title =        "Digital Lollipop: Studying Electrical Stimulation on
                 the Human Tongue to Simulate Taste Sensations",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996462",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Among the five primary senses, the sense of taste is
                 the least explored as a form of digital media applied
                 in Human--Computer Interface. This article presents an
                 experimental instrument, the Digital Lollipop, for
                 digitally simulating the sensation of taste (gustation)
                 by utilizing electrical stimulation on the human
                 tongue. The system is capable of manipulating the
                 properties of electric currents (magnitude, frequency,
                 and polarity) to formulate different stimuli. To
                 evaluate the effectiveness of this method, the system
                 was experimentally tested in two studies. The first
                 experiment was conducted using separate regions of the
                 human tongue to record occurrences of basic taste
                 sensations and their respective intensity levels. The
                 results indicate occurrences of sour, salty, bitter,
                 and sweet sensations from different regions of the
                 tongue. One of the major discoveries of this experiment
                 was that the sweet taste emerges via an inverse-current
                 mechanism, which deserves further research in the
                 future. The second study was conducted to compare
                 natural and artificial (virtual) sour taste sensations
                 and examine the possibility of effectively controlling
                 the artificial sour taste at three intensity levels
                 (mild, medium, and strong). The proposed method is
                 attractive since it does not require any chemical
                 solutions and facilitates further research
                 opportunities in several directions including
                 human--computer interaction, virtual reality, food and
                 beverage, as well as medicine.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Min:2017:FPT,
  author =       "Xiongkuo Min and Guangtao Zhai and Ke Gu and Xiaokang
                 Yang",
  title =        "Fixation Prediction through Multimodal Analysis",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996463",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we propose to predict human eye
                 fixation through incorporating both audio and visual
                 cues. Traditional visual attention models generally
                 make the utmost of stimuli's visual features, yet they
                 bypass all audio information. In the real world,
                 however, we not only direct our gaze according to
                 visual saliency, but also are attracted by salient
                 audio cues. Psychological experiments show that audio
                 has an influence on visual attention, and subjects tend
                 to be attracted by the sound sources. Therefore, we
                 propose fusing both audio and visual information to
                 predict eye fixation. In our proposed framework, we
                 first localize the moving--sound-generating objects
                 through multimodal analysis and generate an audio
                 attention map. Then, we calculate the spatial and
                 temporal attention maps using the visual modality.
                 Finally, the audio, spatial, and temporal attention
                 maps are fused to generate the final audiovisual
                 saliency map. The proposed method is applicable to
                 scenes containing moving--sound-generating objects. We
                 gather a set of video sequences and collect
                 eye-tracking data under an audiovisual test condition.
                 Experiment results show that we can achieve better eye
                 fixation prediction performance when taking both audio
                 and visual cues into consideration, especially in some
                 typical scenes in which object motion and audio are
                 highly correlated.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Chu:2017:POI,
  author =       "Wei-Ta Chu and Chih-Hao Chiu",
  title =        "Predicting Occupation from Images by Combining Face
                 and Body Context Information",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3009911",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Facial images embed age, gender, and other rich
                 information that is implicitly related to occupation.
                 In this work, we advocate that occupation prediction
                 from a single facial image is a doable computer vision
                 problem. We extract multilevel hand-crafted features
                 associated with locality-constrained linear coding and
                 convolutional neural network features as image
                 occupation descriptors. To avoid the curse of
                 dimensionality and overfitting, a boost strategy called
                 multichannel SVM is used to integrate features from
                 face and body. Intra- and interclass visual variations
                 are jointly considered in the boosting framework to
                 further improve performance. In the evaluation, we
                 verify the effectiveness of predicting occupation from
                 face and demonstrate promising performance obtained by
                 combining face and body information. More importantly,
                 our work further integrates deep features into the
                 multichannel SVM framework and shows significantly
                 better performance over the state of the art.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Xu:2017:CSA,
  author =       "Jingxi Xu and Benjamin W. Wah",
  title =        "Consistent Synchronization of Action Order with Least
                 Noticeable Delays in Fast-Paced Multiplayer Online
                 Games",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003727",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "When running multiplayer online games on IP networks
                 with losses and delays, the order of actions may be
                 changed when compared to the order run on an ideal
                 network with no delays and losses. To maintain a proper
                 ordering of events, traditional approaches either use
                 rollbacks to undo certain actions or local lags to
                 introduce additional delays. Both may be perceived by
                 players because their changes are beyond the
                 just-noticeable-difference (JND) threshold. In this
                 article, we propose a novel method for ensuring a
                 strongly consistent completion order of actions, where
                 strong consistency refers to the same completion order
                 as well as the same interval between any completion
                 time and the corresponding ideal reference completion
                 time under no network delay. We find that small
                 adjustments within the JND on the duration of an action
                 would not be perceivable, as long as the duration is
                 comparable to the network round-trip time. We utilize
                 this property to control the vector of durations of
                 actions and formulate the search of the vector as a
                 multidimensional optimization problem. By using the
                 property that players are generally more sensitive to
                 the most prominent delay effect (with the highest
                 probability of noticeability P$_{notice}$ or the
                 probability of correctly noticing a change when
                 compared to the reference), we prove that the optimal
                 solution occurs when P$_{notice}$ of the individual
                 adjustments are equal. As this search can be done
                 efficiently in polynomial time ($\approx$ 5ms) with a
                 small amount of space ($\approx$ 160KB), the search can
                 be done at runtime to determine the optimal
                 control. Last, we evaluate our approach on the popular
                 open-source online shooting game BZFlag.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Schramm:2017:ATS,
  author =       "Rodrigo Schramm and Helena {De Souza Nunes} and
                 Cl{\'a}udio Rosito Jung",
  title =        "Audiovisual Tool for {Solf{\`e}ge} Assessment",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007194",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Solf{\`e}ge is a general technique used in the music
                 learning process that involves the vocal performance of
                 melodies, regarding the time and duration of musical
                 sounds as specified in the music score, properly
                 associated with the meter-mimicking performed by hand
                 movement. This article presents an audiovisual approach
                 for automatic assessment of this relevant musical study
                 practice. The proposed system combines the gesture of
                 meter-mimicking (video information) with the melodic
                 transcription (audio information), where hand movement
                 works as a metronome, controlling the time flow (tempo)
                 of the musical piece. Thus, meter-mimicking is used to
                 align the music score (ground truth) with the sung
                 melody, allowing assessment even in time-dynamic
                 scenarios. Audio analysis is applied to achieve the
                 melodic transcription of the sung notes and the
                 solf{\`e}ge performances are evaluated by a set of
                 Bayesian classifiers that were generated from real
                 evaluations done by expert listeners.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2017:IRS,
  author =       "Haojun Wu and Yong Wang and Jiwu Huang",
  title =        "Identification of Reconstructed Speech",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3004055",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Both voice conversion and hidden Markov model-- (HMM)
                 based speech synthesis can be used to produce
                 artificial voices of a target speaker. They have shown
                 great negative impacts on speaker verification (SV)
                 systems. In order to enhance the security of SV
                 systems, the techniques to detect converted/synthesized
                 speech should be taken into consideration. During voice
                 conversion and HMM-based synthesis, speech
                 reconstruction is applied to transform a set of
                 acoustic parameters to reconstructed speech. Hence, the
                 identification of reconstructed speech can be used to
                 distinguish converted/synthesized speech from human
                 speech. Several related works on such identification
                 have been reported. The equal error rates (EERs) lower
                 than 5\% of detecting reconstructed speech have been
                 achieved. However, through the cross-database
                 evaluations on different speech databases, we find that
                 the EERs of several testing cases are higher than 10\%.
                 The robustness of detection algorithms to different
                 speech databases needs to be improved. In this article,
                 we propose an algorithm to identify the reconstructed
                 speech. Three different speech databases and two
                 different reconstruction methods are considered in our
                 work, which has not been addressed in the reported
                 works. The high-dimensional data visualization approach
                 is used to analyze the effect of speech reconstruction
                 on Mel-frequency cepstral coefficients (MFCC) of speech
                 signals. The Gaussian mixture model supervectors of
                 MFCC are used as acoustic features. Furthermore, a set
                 of commonly used classification algorithms are applied
                 to identify reconstructed speech. According to the
                 comparison among different classification methods,
                 linear discriminant analysis-ensemble classifiers are
                 chosen in our algorithm. Extensive experimental results
                 show that the EERs lower than 1\% can be achieved by
                 the proposed algorithm in most cases, outperforming the
                 reported state-of-the-art identification techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gaj:2017:DCR,
  author =       "Sibaji Gaj and Aditya Kanetkar and Arijit Sur and
                 Prabin Kumar Bora",
  title =        "Drift-Compensated Robust Watermarking Algorithm for
                 {H.265\slash HEVC} Video Stream",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3009910",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "It has been observed in the recent literature that the
                 drift error due to watermarking degrades the visual
                 quality of the embedded video. The existing drift error
                 handling strategies for recent video standards such as
                 H.264 may not be directly applicable for upcoming
                 high-definition video standards (such as High
                 Efficiency Video Coding (HEVC)) due to different
                 compression architecture. In this article, a compressed
                 domain watermarking scheme is proposed for H.265/HEVC
                 bit stream that can handle drift error propagation both
                 for intra- and interprediction process. Additionally,
                 the proposed scheme shows adequate robustness against
                 recompression attack as well as common image processing
                 attacks while maintaining decent visual quality. A
                 comprehensive set of experiments has been carried out
                 to justify the efficacy of the proposed scheme over the
                 existing literature.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Dutta:2017:EFC,
  author =       "Tanima Dutta and Hari Prabhat Gupta",
  title =        "An Efficient Framework for Compressed Domain
                 Watermarking in {$P$} Frames of High-Efficiency Video
                 Coding ({HEVC})-Encoded Video",
  journal =      j-TOMM,
  volume =       "13",
  number =       "1",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3002178",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Jan 18 17:18:28 MST 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Digital watermarking has received much attention in
                 recent years as a promising solution to copyright
                 protection. Video watermarking in compressed domain has
                 gained importance since videos are stored and
                 transmitted in a compressed format. This decreases the
                 overhead to fully decode and re-encode the video for
                 embedding and extraction of the watermark. High
                 Efficiency Video Coding (HEVC/H.265) is the latest and
                 most efficient video compression standard and a
                 successor to H.264 Advanced Video Coding. In this
                 article, we propose a robust watermarking framework for
                 HEVC-encoded video using informed detector. A readable
                 watermark is embedded invisibly in P frames for better
                 perceptual quality. Our framework imposes security and
                 robustness by selecting appropriate blocks using a
                 random key and the spatio-temporal characteristics of
                 the compressed video. A detailed analysis of the
                 strengths of different compressed domain features is
                 performed for implementing the watermarking framework.
                 We experimentally demonstrate the utility of the
                 proposed work. The results show that the proposed work
                 effectively limits the increase in video bitrate and
                 degradation in perceptual quality. The proposed
                 framework is robust against re-encoding and image
                 processing attacks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lisanti:2017:MKC,
  author =       "Giuseppe Lisanti and Svebor Karaman and Iacopo Masi",
  title =        "Multichannel-Kernel Canonical Correlation Analysis for
                 Cross-View Person Reidentification",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3038916",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we introduce a method to overcome one
                 of the main challenges of person reidentification in
                 multicamera networks, namely cross-view appearance
                 changes. The proposed solution addresses the extreme
                 variability of person appearance in different camera
                 views by exploiting multiple feature representations.
                 For each feature, kernel canonical correlation analysis
                 with different kernels is employed to learn several
                 projection spaces in which the appearance correlation
                 between samples of the same person observed from
                 different cameras is maximized. An iterative logistic
                 regression is finally used to select and weight the
                 contributions of each projection and perform the
                 matching between the two views. Experimental evaluation
                 shows that the proposed solution obtains comparable
                 performance on the VIPeR and PRID 450s datasets and
                 improves on the PRID and CUHK01 datasets with respect
                 to the state of the art.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ye:2017:TOM,
  author =       "Jun Ye and Hao Hu and Guo-Jun Qi and Kien A. Hua",
  title =        "A Temporal Order Modeling Approach to Human Action
                 Recognition from Multimodal Sensor Data",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3038917",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "From wearable devices to depth cameras, researchers
                 have exploited various multimodal data to recognize
                 human actions for applications, such as video gaming,
                 education, and healthcare. Although many
                 successful techniques have been presented in the
                 literature, most current approaches have focused on
                 statistical or local spatiotemporal features and do not
                 explicitly explore the temporal dynamics of the sensor
                 data. However, human action data contain rich temporal
                 structure information that can characterize the unique
                 underlying patterns of different action categories.
                 From this perspective, we propose a novel temporal
                 order modeling approach to human action recognition.
                 Specifically, we explore subspace projections to
                 extract the latent temporal patterns from different
                 human action sequences. The temporal order between
                 these patterns is compared, and the index of the
                 pattern that appears first is used to encode the entire
                 sequence. This process is repeated multiple times and
                 produces a compact feature vector representing the
                 temporal dynamics of the sequence. Human action
                 recognition can then be efficiently solved by the
                 nearest neighbor search based on the Hamming distance
                 between these compact feature vectors. We further
                 introduce a sequential optimization algorithm to learn
                 the optimized projections that preserve the pairwise
                 label similarity of the action sequences. Experimental
                 results on two public human action datasets demonstrate
                 the superior performance of the proposed technique in
                 both accuracy and efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2017:MCL,
  author =       "Shuai Wang and Yang Cong and Huijie Fan and Baojie Fan
                 and Lianqing Liu and Yunsheng Yang and Yandong Tang and
                 Huaici Zhao and Haibin Yu",
  title =        "Multi-Class Latent Concept Pooling for Computer-Aided
                 Endoscopy Diagnosis",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3051481",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Successful computer-aided diagnosis systems typically
                 rely on training datasets containing sufficient and
                 richly annotated images. However, detailed image
                 annotation is often time consuming and subjective,
                 especially for medical images, which becomes the
                 bottleneck for the collection of large datasets and
                 then building computer-aided diagnosis systems. In this
                 article, we design a novel computer-aided endoscopy
                 diagnosis system to deal with the multi-classification
                 problem of electronic endoscopy medical records (EEMRs)
                 containing sets of frames, while labels of EEMRs can be
                 mined from the corresponding text records using an
                 automatic text-matching strategy without human special
                 labeling. With unambiguous EEMR labels and ambiguous
                 frame labels, we propose a simple but effective pooling
                 scheme called Multi-class Latent Concept Pooling, which
                 learns a codebook from EEMRs with different classes
                 step by step and encodes EEMRs based on a soft
                 weighting strategy. In our method, a computer-aided
                 diagnosis system can be extended to new unseen classes
                 with ease and applied to the standard single-instance
                 classification problem even though detailed annotated
                 images are unavailable. In order to validate our
                 system, we collect 1,889 EEMRs with more than 59K
                 frames and successfully mine labels for 348 of them.
                 The experimental results show that our proposed system
                 significantly outperforms the state-of-the-art methods.
                 Moreover, we apply the learned latent concept codebook
                 to detect the abnormalities in endoscopy images and
                 compare it with a supervised learning classifier, and
                 the evaluation shows that our codebook learning method
                 can effectively extract the true prototypes related to
                 different classes from the ambiguous data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Demirbilek:2017:MLB,
  author =       "Edip Demirbilek and Jean-Charles Gr{\'e}goire",
  title =        "Machine Learning-Based Parametric Audiovisual Quality
                 Prediction Models for Real-Time Communications",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "16:1--16:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3051482",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In order to mechanically predict audiovisual quality
                 in interactive multimedia services, we have developed
                 machine learning--based no-reference parametric models.
                 We have compared Decision Trees--based ensemble
                 methods, Genetic Programming and Deep Learning models
                 that have one and more hidden layers. We have used the
                 Institut national de la recherche scientifique (INRS)
                 audiovisual quality dataset specifically designed to
                 include ranges of parameters and degradations typically
                 seen in real-time communications. Decision Trees--based
                 ensemble methods have outperformed both Deep Learning--
                 and Genetic Programming--based models in terms of
                 Root-Mean-Square Error (RMSE) and Pearson correlation
                 values. We have also trained and developed models on
                 various publicly available datasets and have compared
                 our results with those of these original models. Our
                 studies show that Random Forests--based prediction
                 models achieve high accuracy for both the INRS
                 audiovisual quality dataset and other publicly
                 available comparable datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gokhale:2017:CCN,
  author =       "Vineet Gokhale and Jayakrishnan Nair and Subhasis
                 Chaudhuri",
  title =        "Congestion Control for Network-Aware Telehaptic
                 Communication",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "17:1--17:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3052821",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Telehaptic applications involve delay-sensitive
                 multimedia communication between remote locations with
                 distinct Quality of Service (QoS) requirements for
                 different media components. These QoS constraints pose
                 a variety of challenges, especially when the
                 communication occurs over a shared network, with
                 unknown and time-varying cross-traffic. In this work,
                 we propose a transport layer congestion control
                 protocol for telehaptic applications operating over
                 shared networks, termed as Dynamic Packetization Module
                 (DPM). DPM is a lossless, network-aware protocol that
                 tunes the telehaptic packetization rate based on the
                 level of congestion in the network. To monitor the
                 network congestion, we devise a novel network feedback
                 module, which communicates the end-to-end delays
                 encountered by the telehaptic packets to the respective
                 transmitters with negligible overhead. Via extensive
                 simulations, we show that DPM meets the QoS
                 requirements of telehaptic applications over a wide
                 range of network cross-traffic conditions. We also
                 report qualitative results of a real-time telepottery
                 experiment with several human subjects, which reveal
                 that DPM preserves the quality of telehaptic activity
                 even under heavily congested network scenarios.
                 Finally, we compare the performance of DPM with several
                 previously proposed telehaptic communication protocols
                 and demonstrate that DPM outperforms these protocols.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Sobhani:2017:VBA,
  author =       "Ashkan Sobhani and Abdulsalam Yassine and Shervin
                 Shirmohammadi",
  title =        "A Video Bitrate Adaptation and Prediction Mechanism
                 for {HTTP} Adaptive Streaming",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "18:1--18:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3052822",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The Hypertext Transfer Protocol (HTTP) Adaptive
                 Streaming (HAS) has now become ubiquitous and accounts
                 for a large amount of video delivery over the Internet.
                 But since the Internet is prone to bandwidth
                 variations, HAS's up and down switching between
                 different video bitrates to keep up with bandwidth
                 variations leads to a reduction in Quality of
                 Experience (QoE). In this article, we propose a video
                 bitrate adaptation and prediction mechanism based on
                 Fuzzy logic for HAS players, which takes into
                 consideration the estimate of available network
                 bandwidth as well as the predicted buffer occupancy
                 level in order to proactively and intelligently respond
                 to current conditions. This leads to two contributions:
                 First, it allows HAS players to take appropriate
                 actions, sooner than existing methods, to prevent
                 playback interruptions caused by buffer underrun,
                 reducing the ON-OFF traffic phenomena associated with
                 current approaches and increasing the QoE. Second, it
                 facilitates fair sharing of bandwidth among competing
                 players at the bottleneck link. We present the
                 implementation of our proposed mechanism and provide
                 both empirical/QoE analysis and performance comparison
                 with existing work. Our results show that, compared to
                 existing systems, our system has (1) better fairness
                 among multiple competing players by almost 50\% on
                 average and as much as 80\% as indicated by Jain's
                 fairness index and (2) better perceived quality of
                 video by almost 8\% on average and as much as 17\%,
                 according to the estimated Mean Opinion Score (eMOS)
                 model.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Grant:2017:CSU,
  author =       "Jason M. Grant and Patrick J. Flynn",
  title =        "Crowd Scene Understanding from Video: a Survey",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "19:1--19:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3052930",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Crowd video analysis has applications in crowd
                 management, public space design, and visual
                 surveillance. Example tasks potentially aided by
                 automated analysis include anomaly detection (such as a
                 person walking against the grain of traffic or rapid
                 assembly/dispersion of groups of people), population
                 and density measurements, and interactions between
                 groups of people. This survey explores crowd analysis
                 as it relates to two primary research areas: crowd
                 statistics and behavior understanding. First, we survey
                 methods for counting individuals and approximating the
                 density of the crowd. Second, we showcase research
                 efforts on behavior understanding as related to crowds.
                 These works focus on identifying groups, interactions
                 within small groups, and abnormal activity detection
                 such as riots and bottlenecks in large crowds. Works
                 presented in this section also focus on tracking groups
                 of individuals, either as a single entity or a subset
                 of individuals within the frame of reference. Finally,
                 a summary of datasets available for crowd activity
                 video research is provided.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hussein:2017:VJF,
  author =       "Fairouz Hussein and Massimo Piccardi",
  title =        "{V-JAUNE}: a Framework for Joint Action Recognition
                 and Video Summarization",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "20:1--20:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3063532",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Video summarization and action recognition are two
                 important areas of multimedia video analysis. While
                 these two areas have been tackled separately to date,
                 in this article, we present a latent structural SVM
                 framework to recognize the action and derive the
                 summary of a video in a joint, simultaneous fashion.
                 Efficient inference is provided by a submodular score
                 function that accounts for the action and summary
                 jointly. In this article, we also define a novel
                 measure to evaluate the quality of a predicted video
                 summary against the annotations of multiple annotators.
                 Quantitative and qualitative results over two
                 challenging action datasets---the ACE and MSR
                 DailyActivity3D datasets---show that the proposed joint
                 approach leads to higher action recognition accuracy
                 and equivalent or better summary quality than
                 comparable approaches that perform these tasks
                 separately.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Cizmeci:2017:MSM,
  author =       "Burak Cizmeci and Xiao Xu and Rahul Chaudhari and
                 Christoph Bachhuber and Nicolas Alt and Eckehard
                 Steinbach",
  title =        "A Multiplexing Scheme for Multimodal Teleoperation",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "21:1--21:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3063594",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article proposes an application-layer
                 multiplexing scheme for teleoperation systems with
                 multimodal feedback (video, audio, and haptics). The
                 available transmission resources are carefully
                 allocated to avoid delay-jitter for the haptic signal
                 potentially caused by the size and arrival time of the
                 video and audio data. The multiplexing scheme gives
                 high priority to the haptic signal and applies a
                 preemptive-resume scheduling strategy to stream the
                 audio and video data. The proposed approach estimates
                 the available transmission rate in real time and adapts
                 the video bitrate, data throughput, and force buffer
                 size accordingly. Furthermore, the proposed scheme
                 detects sudden transmission rate drops and applies
                 congestion control to avoid abrupt delay increases and
                 converge promptly to the altered transmission rate. The
                 performance of the proposed scheme is measured
                 objectively in terms of end-to-end signal latencies,
                 packet rates, and peak signal-to-noise ratio (PSNR) for
                 visual quality. Moreover, peak-delay and convergence
                 time measurements are carried out to investigate the
                 performance of the congestion control mode of the
                 system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Su:2017:DDP,
  author =       "Zhuo Su and Kun Zeng and Hanhui Li and Xiaonan Luo",
  title =        "A Dual-Domain Perceptual Framework for Generating
                 Visual Inconspicuous Counterparts",
  journal =      j-TOMM,
  volume =       "13",
  number =       "2",
  pages =        "22:1--22:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3068427",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jun 16 14:48:38 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tomccap/;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "For a given image, it is a challenging task to
                 generate its corresponding counterpart with visual
                 inconspicuous modification. The complexity of this
                 problem reasons from the high correlativity between the
                 editing operations and vision perception. Essentially,
                 a significant requirement that should be emphasized is
                 how to make the object modifications hard to be found
                 visually in the generative counterparts. In this
                 article, we propose a novel dual-domain perceptual
                 framework to generate visual inconspicuous
                 counterparts, which applies the perceptual
                 bidirectional similarity metric (PBSM) and appearance
                 similarity metric (ASM) to create the dual-domain
                 perception error minimization model. The candidate
                 targets are yielded by the well-known PatchMatch model
                 with the strokes-based interactions and selective
                 object library. By the dual-perceptual evaluation
                 index, all candidate targets are sorted to select out
                 the best result. For demonstration, a series of
                 objective and subjective measurements are used to
                 evaluate the performance of our framework.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Singh:2017:SCB,
  author =       "Priyanka Singh and Balasubramanian Raman and Nishant
                 Agarwal and Pradeep K. Atrey",
  title =        "Secure Cloud-Based Image Tampering Detection and
                 Localization Using {POB} Number System",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "23:1--23:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3077140",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The benefits of high-end computation infrastructure
                 facilities provided by cloud-based multimedia systems
                 are attracting people all around the globe. However,
                 such cloud-based systems possess security issues as
                 third party servers become involved in them. Rendering
                 data in an unreadable form so that no information is
                 revealed to the cloud data centers will serve as the
                 best solution to these security issues. One such image
                 encryption scheme based on a Permutation Ordered Binary
                 Number System has been proposed in this work. It
                 distributes the image information in totally random
                 shares, which can be stored at the cloud data centers.
                 Further, the proposed scheme authenticates the shares
                 at the pixel level. If any tampering is done at the
                 cloud servers, the scheme can accurately identify the
                 altered pixels via authentication bits and localizes
                 the tampered area. The tampered portion is also
                 reflected back in the reconstructed image that is
                 obtained at the authentic user end. The experimental
                 results validate the efficacy of the proposed scheme
                 against various kinds of possible attacks, tested with
                 a variety of images. The tamper detection accuracy has
                 been computed on a pixel basis and found to be
                 satisfactorily high for most of the tampering
                 scenarios.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Thirunarayanan:2017:CSE,
  author =       "Ishwarya Thirunarayanan and Khimya Khetarpal and
                 Sanjeev Koppal and Olivier {Le Meur} and John Shea and
                 Eakta Jain",
  title =        "Creating Segments and Effects on Comics by Clustering
                 Gaze Data",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "24:1--24:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3078836",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Traditional comics are increasingly being augmented
                 with digital effects, such as recoloring, stereoscopy,
                 and animation. An open question in this endeavor is
                 identifying where in a comic panel the effects should
                 be placed. We propose a fast, semi-automatic technique
                 to identify effects-worthy segments in a comic panel by
                 utilizing gaze locations as a proxy for the importance
                 of a region. We take advantage of the fact that comic
                 artists influence viewer gaze towards narrative
                 important regions. By capturing gaze locations from
                 multiple viewers, we can identify important regions and
                 direct a computer vision segmentation algorithm to
                 extract these segments. The challenge is that these
                 gaze data are noisy and difficult to process. Our key
                 contribution is to leverage a theoretical breakthrough
                 in the computer networks community towards robust and
                 meaningful clustering of gaze locations into semantic
                 regions, without needing the user to specify the number
                 of clusters. We present a method based on the concept
                 of relative eigen quality that takes a scanned comic
                 image and a set of gaze points and produces an image
                 segmentation. We demonstrate a variety of effects such
                 as defocus, recoloring, stereoscopy, and animations. We
                 also investigate the use of artificially generated gaze
                 locations from saliency models in place of actual gaze
                 locations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Houle:2017:QEC,
  author =       "Michael E. Houle and Xiguo Ma and Vincent Oria and
                 Jichao Sun",
  title =        "Query Expansion for Content-Based Similarity Search
                 Using Local and Global Features",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "25:1--25:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3063595",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article presents an efficient and totally
                 unsupervised content-based similarity search method for
                 multimedia data objects represented by high-dimensional
                 feature vectors. The assumption is that the similarity
                 measure is applicable to feature vectors of arbitrary
                 length. During the offline process, different sets of
                 features are selected by a generalized version of the
                 Laplacian Score in an unsupervised way for individual
                 data objects in the database. Online retrieval is
                 performed by ranking the query object in the feature
                 spaces of candidate objects. Those candidates for which
                 the query object is ranked highly are selected as the
                 query results. The ranking scheme is incorporated into
                 an automated query expansion framework to further
                 improve the semantic quality of the search result.
                 Extensive experiments were conducted on several
                 datasets to show the capability of the proposed method
                 in boosting effectiveness without losing efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Riegler:2017:ACA,
  author =       "Michael Riegler and Konstantin Pogorelov and Sigrun
                 Losada Eskeland and Peter Thelin Schmidt and Zeno
                 Albisser and Dag Johansen and Carsten Griwodz and
                 P{\aa}l Halvorsen and Thomas {De Lange}",
  title =        "From Annotation to Computer-Aided Diagnosis: Detailed
                 Evaluation of a Medical Multimedia System",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "26:1--26:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3079765",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Holistic medical multimedia systems covering
                 end-to-end functionality from data collection to aided
                 diagnosis are highly needed, but rare. In many
                 hospitals, the potential value of multimedia data
                 collected through routine examinations is not
                 recognized. Moreover, the availability of the data is
                 limited, as the health care personnel may not have
                 direct access to stored data. However, medical
                 specialists interact with multimedia content daily
                 through their everyday work and have an increasing
                 interest in finding ways to use it to facilitate their
                 work processes. In this article, we present a novel,
                 holistic multimedia system aiming to tackle automatic
                 analysis of video from gastrointestinal (GI) endoscopy.
                 The proposed system comprises the whole pipeline,
                 including data collection, processing, analysis, and
                 visualization. It combines filters using machine
                 learning, image recognition, and extraction of global
                 and local image features. The novelty is primarily in
                 this holistic approach and its real-time performance,
                 where we automate a complete algorithmic GI screening
                 process. We built the system in a modular way to make
                 it easily extendable to analyze various abnormalities,
                 and we made it efficient in order to run in real time.
                 The conducted experimental evaluation proves that the
                 detection and localization accuracy are comparable or
                 even better than existing systems, but it is by far
                 leading in terms of real-time performance and efficient
                 resource consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yang:2017:EPR,
  author =       "Xun Yang and Meng Wang and Richang Hong and Qi Tian
                 and Yong Rui",
  title =        "Enhancing Person Re-identification in a Self-Trained
                 Subspace",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "27:1--27:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3089249",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Despite the promising progress made in recent years,
                 person re-identification (re-ID) remains a challenging
                 task due to the complex variations in human appearances
                 from different camera views. For this challenging
                 problem, a large variety of algorithms have been
                 developed in the fully supervised setting, requiring
                 access to a large amount of labeled training data.
                 However, the main bottleneck for fully supervised re-ID
                 is the limited availability of labeled training
                 samples. To address this problem, we propose a
                 self-trained subspace learning paradigm for person
                 re-ID that effectively utilizes both labeled and
                 unlabeled data to learn a discriminative subspace where
                 person images across disjoint camera views can be
                 easily matched. The proposed approach first constructs
                 pseudo-pairwise relationships among unlabeled persons
                 using the $k$-nearest neighbors algorithm. Then, with the
                 pseudo-pairwise relationships, the unlabeled samples
                 can be easily combined with the labeled samples to
                 learn a discriminative projection by solving an
                 eigenvalue problem. In addition, we refine the
                 pseudo-pairwise relationships iteratively, which
                 further improves learning performance. A multi-kernel
                 embedding strategy is also incorporated into the
                 proposed approach to cope with the non-linearity in a
                 person's appearance and explore the complementation of
                 multiple kernels. In this way, the performance of
                 person re-ID can be greatly enhanced when training data
                 are insufficient. Experimental results on six widely
                 used datasets demonstrate the effectiveness of our
                 approach, and its performance can be comparable to the
                 reported results of most state-of-the-art fully
                 supervised methods while using much fewer labeled
                 data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Lin:2017:RHA,
  author =       "Shih-Yao Lin and Yen-Yu Lin and Chu-Song Chen and
                 Yi-Ping Hung",
  title =        "Recognizing Human Actions with Outlier Frames by
                 Observation Filtering and Completion",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "28:1--28:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3089250",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article addresses the problem of recognizing
                 partially observed human actions. Videos of actions
                 acquired in the real world often contain corrupt frames
                 caused by various factors. These frames may appear
                 irregularly, and make the actions only partially
                 observed. They change the appearance of actions and
                 degrade the performance of pretrained recognition
                 systems. In this article, we propose an approach to
                 address the corrupt-frame problem without knowing their
                 locations and durations in advance. The proposed
                 approach includes two key components: outlier filtering
                 and observation completion. The former identifies and
                 filters out unobserved frames, and the latter fills up
                 the filtered parts by retrieving coherent alternatives
                 from training data. Hidden Conditional Random Fields
                 (HCRFs) are then used to recognize the filtered and
                 completed actions. Our approach has been evaluated on
                 three datasets, which contain both fully observed
                 actions and partially observed actions with either real
                 or synthetic corrupt frames. The experimental results
                 show that our approach performs favorably against the
                 other state-of-the-art methods, especially when corrupt
                 frames are present.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Karafotias:2017:IER,
  author =       "Georgios Karafotias and Akiko Teranishi and Georgios
                 Korres and Friederike Eyssel and Scandar Copti and
                 Mohamad Eid",
  title =        "Intensifying Emotional Reactions via Tactile Gestures
                 in Immersive Films",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "29:1--29:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092840",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The film industry continuously strives to make
                 visitors' movie experience more immersive and thus,
                 more captivating. This is realized through larger
                 screens, sophisticated speaker systems, and high
                 quality 2D and 3D content. Moreover, a recent trend in
                 the film industry is to incorporate multiple
                 interaction modalities, such as 4D film, to simulate
                 rain, wind, vibration, and heat, in order to intensify
                 viewers' emotional reactions. In this context, humans'
                 sense of touch possesses significant potential for
                 intensifying emotional reactions for the film
                 experience beyond audio-visual sensory modalities. This
                 article presents a framework for authoring tactile cues
                 (tactile gestures as used in this article) and enabling
                 automatic rendering of said gestures to intensify
                 emotional reactions in an immersive film experience. To
                 validate the proposed framework, we conducted an
                 experimental study where tactile gestures are designed
                 and evaluated for the ability to intensify four
                 emotional reactions: high valence-high arousal, high
                 valence-low arousal, low valence-high arousal, and low
                 valence-low arousal. Using a haptic jacket,
                 participants felt tactile gestures that are
                 synchronized with the audio-visual contents of a film.
                 Results demonstrated that (1) any tactile feedback
                 generated a positive user experience; (2) the tactile
                 feedback intensifies emotional reactions when the
                 audio-visual stimuli elicit clear emotional responses,
                 except for low arousal emotional response since tactile
                 gestures seem to always generate excitement; (3)
                 purposed tactile gestures do not seem to significantly
                 outperform randomized tactile gesture for intensifying
                 specific emotional reactions; and (4) using a haptic
                 jacket is not distracting for the users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Cheung:2017:ASU,
  author =       "Ming Cheung and James She",
  title =        "An Analytic System for User Gender Identification
                 through User Shared Images",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "30:1--30:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3095077",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Many social media applications, such as
                 recommendation, virality prediction, and marketing,
                 make use of user gender, which may not be explicitly
                 specified or kept privately. Meanwhile, advanced mobile
                 devices have become part of our lives and a huge amount
                 of content is being generated by users every day,
                 especially user shared images shared by individuals in
                 social networks. This particular form of user generated
                 content is widely accessible to others due to the
                 sharing nature. When user gender is only accessible to
                 exclusive parties, these user shared images are proved
                 to be an easier way to identify user gender. This work
                 investigated 3,152,344 images by 7,450 users from
                 Fotolog and Flickr, two image-oriented social networks.
                 It is observed that users who share visually similar
                 images are more likely to have the same gender. A
                 multimedia big data system that utilizes this
                 phenomenon is proposed for user gender identification
                 with 79\% accuracy. These findings are useful for
                 information or services in any social network with
                 intensive image sharing.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Engelbrecht:2017:PDS,
  author =       "Herman A. Engelbrecht and John S. Gilmore",
  title =        "{Pithos}: Distributed Storage for Massive Multi-User
                 Virtual Environments",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "31:1--31:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3105577",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "There has been significant research effort into
                 peer-to-peer (P2P) massively multi-user virtual
                 environments (MMVEs). A number of architectures have
                 been proposed to implement the P2P approach; however,
                 the development of fully distributed MMVEs has met with
                 a number of challenges. In this work, we address one of
                 the key remaining challenges of state consistency and
                 persistency in P2P MMVEs. Having reviewed state
                 management and persistency architectures currently
                 receiving research attention, we have identified
                 deficiencies such as lack of load balancing,
                 responsiveness, and scalability. To address these
                 deficiencies, we present Pithos---a reliable, responsive,
                 secure, load-balanced, and scalable distributed storage
                 system, suited to P2P MMVEs. Pithos is designed
                 specifically for P2P MMVEs, and we show that it
                 improves the reliability and responsiveness of storage
                 architectures as compared to existing P2P state
                 persistency architectures. Pithos is implemented as an
                 OverSim simulation running on the OMNeT++ network
                 simulation framework. It is evaluated using up to
                 10,400 peers, with realistic latency profiles, with up
                 to 15.8 million storage and retrieval requests that are
                 generated to store a total of 2.4 million objects. Each
                 peer in Pithos uses a maximum of 1,950 Bps bandwidth to
                 achieve 99.98\% storage reliability, while the most
                 reliable overlay storage configuration tested only
                 achieved 93.65\% reliability, using 2,182 Bps bandwidth.
                 Pithos is also more responsive than overlay storage,
                 with an average responsiveness of 0.192 s, compared with
                 the average overlay responsiveness of 1.4 s when
                 retrieving objects from storage.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2017:SDL,
  author =       "Jun Zhang and Meng Wang and Liang Lin and Xun Yang and
                 Jun Gao and Yong Rui",
  title =        "Saliency Detection on Light Field: a Multi-Cue
                 Approach",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3",
  pages =        "32:1--32:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3107956",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Saliency detection has recently received increasing
                 research interest on using high-dimensional datasets
                 beyond two-dimensional images. Despite the many
                 available capturing devices and algorithms, there still
                 exists a wide spectrum of challenges that need to be
                 addressed to achieve accurate saliency detection.
                 Inspired by the success of the light-field technique,
                 in this article, we propose a new computational scheme
                 to detect salient regions by integrating multiple
                 visual cues from light-field images. First, saliency
                 prior maps are generated from several light-field
                 features based on superpixel-level intra-cue
                 distinctiveness, such as color, depth, and flow
                 inherited from different focal planes and multiple
                 viewpoints. Then, we introduce the location prior to
                 enhance the saliency maps. These maps will finally be
                 merged into a single map using a random-search-based
                 weighting strategy. Besides, we refine the object
                 details by employing a two-stage saliency refinement to
                 obtain the final saliency map. In addition, we present
                 a more challenging benchmark dataset for light-field
                 saliency analysis, named HFUT-Lytro, which consists of
                 255 light fields with a range from 53 to 64 images
                 generated from each light-field image, therein spanning
                 multiple occurrences of saliency detection challenges
                 such as occlusions, cluttered background, and
                 appearance changes. Experimental results show that our
                 approach can achieve 0.6--6.7\% relative improvements
                 over state-of-the-art methods in terms of the F-measure
                 and Precision metrics, which demonstrates the
                 effectiveness of the proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ota:2017:ISI,
  author =       "Kaoru Ota and Minh Son Dao and Vasileios Mezaris and
                 Francesco G. B. {De Natale}",
  title =        "Introduction to Special Issue on Deep Learning for
                 Mobile Multimedia",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "33:1--33:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3088340",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ota:2017:DLM,
  author =       "Kaoru Ota and Minh Son Dao and Vasileios Mezaris and
                 Francesco G. B. {De Natale}",
  title =        "Deep Learning for Mobile Multimedia: a Survey",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "34:1--34:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092831",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Deep Learning (DL) has become a crucial technology for
                 multimedia computing. It offers a powerful instrument
                 to automatically produce high-level abstractions of
                 complex multimedia data, which can be exploited in a
                 number of applications, including object detection and
                 recognition, speech-to-text, media retrieval,
                 multimodal data analysis, and so on. The availability
                 of affordable large-scale parallel processing
                 architectures, and the sharing of effective open-source
                 codes implementing the basic learning algorithms,
                 caused a rapid diffusion of DL methodologies, bringing
                 a number of new technologies and applications that
                 outperform, in most cases, traditional machine learning
                 technologies. In recent years, the possibility of
                 implementing DL technologies on mobile devices has
                 attracted significant attention. Thanks to this
                 technology, portable devices may become smart objects
                 capable of learning and acting. The path toward these
                 exciting future scenarios, however, entangles a number
                 of important research challenges. DL architectures and
                 algorithms are hardly adapted to the storage and
                 computation resources of a mobile device. Therefore,
                 there is a need for new generations of mobile
                 processors and chipsets, small footprint learning and
                 inference algorithms, new models of collaborative and
                 distributed processing, and a number of other
                 fundamental building blocks. This survey reports the
                 state of the art in this exciting research area,
                 looking back to the evolution of neural networks, and
                 arriving to the most recent results in terms of
                 methodologies, technologies, and applications for
                 mobile environments.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Seidenari:2017:DAD,
  author =       "Lorenzo Seidenari and Claudio Baecchi and Tiberio
                 Uricchio and Andrea Ferracani and Marco Bertini and
                 Alberto {Del Bimbo}",
  title =        "Deep Artwork Detection and Retrieval for Automatic
                 Context-Aware Audio Guides",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "35:1--35:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092832",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we address the problem of creating a
                 smart audio guide that adapts to the actions and
                 interests of museum visitors. As an autonomous agent,
                 our guide perceives the context and is able to interact
                 with users in an appropriate fashion. To do so, it
                 understands what the visitor is looking at, if the
                 visitor is moving inside the museum hall, or if he or
                 she is talking with a friend. The guide performs
                 automatic recognition of artworks, and it provides
                 configurable interface features to improve the user
                 experience and the fruition of multimedia materials
                 through semi-automatic interaction. Our smart audio
                 guide is backed by a computer vision system capable of
                 working in real time on a mobile device, coupled with
                 audio and motion sensors. We propose the use of a
                 compact Convolutional Neural Network (CNN) that
                 performs object classification and localization. Using
                 the same CNN features computed for these tasks, we
                 perform also robust artwork recognition. To improve the
                 recognition accuracy, we perform additional video
                 processing using shape-based filtering, artwork
                 tracking, and temporal filtering. The system has been
                 deployed on an NVIDIA Jetson TK1 and a NVIDIA Shield
                 Tablet K1 and tested in a real-world environment
                 (Bargello Museum of Florence).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Pouladzadeh:2017:MMF,
  author =       "Parisa Pouladzadeh and Shervin Shirmohammadi",
  title =        "Mobile Multi-Food Recognition Using Deep Learning",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "36:1--36:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3063592",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we propose a mobile food recognition
                 system that uses the picture of the food, taken by the
                 user's mobile device, to recognize multiple food items
                 in the same meal, such as steak and potatoes on the
                 same plate, to estimate the calorie and nutrition of
                 the meal. To speed up and make the process more
                 accurate, the user is asked to quickly identify the
                 general area of the food by drawing a bounding circle
                 on the food picture by touching the screen. The system
                 then uses image processing and computational
                 intelligence for food item recognition. The advantage
                 of recognizing items, instead of the whole meal, is
                 that the system can be trained with only single item
                 food images. At the training stage, we first use region
                 proposal algorithms to generate candidate regions and
                 extract the convolutional neural network (CNN) features
                 of all regions. Second, we perform region mining to
                 select positive regions for each food category using
                 maximum cover by our proposed submodular optimization
                 method. At the testing stage, we first generate a set
                 of candidate regions. For each region, a classification
                 score is computed based on its extracted CNN features
                 and predicted food names of the selected regions. Since
                 fast response is one of the important parameters for
                 the user who wants to eat the meal, certain heavy
                 computational parts of the application are offloaded to
                 the cloud. Hence, the processes of food recognition and
                 calorie estimation are performed in cloud server. Our
                 experiments, conducted with the FooDD dataset, show an
                 average recall rate of 90.98\%, precision rate of
                 93.05\%, and accuracy of 94.11\% compared to 50.8\% to
                 88\% accuracy of other existing food recognition
                 systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bharati:2017:ETC,
  author =       "Sailesh Bharati and Hassan Aboubakr Omar and Weihua
                 Zhuang",
  title =        "Enhancing Transmission Collision Detection for
                 Distributed {TDMA} in Vehicular Networks",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "37:1--37:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092833",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The increasing number of road accidents has led to the
                 evolution of vehicular ad hoc networks (VANETs), which
                 allow vehicles and roadside infrastructure to
                 continuously broadcast safety messages, including
                 necessary information to avoid undesired events on the
                 road. To support reliable broadcast of safety messages,
                 distributed time division multiple access (D-TDMA)
                 protocols are proposed for medium access control in
                 VANETs. Existing D-TDMA protocols react to a
                 transmission failure without distinguishing whether the
                 failure comes from a transmission collision or from a
                 poor radio channel condition, resulting in degraded
                 performance. In this article, we present the importance
                 of transmission failure differentiation due to a poor
                 channel or due to a transmission collision for D-TDMA
                 protocols in vehicular networks. We study the effects
                 of such a transmission failure differentiation on the
                 performance of a node when reserving a time slot to
                 access the transmission channel. Furthermore, we
                 propose a method for transmission failure
                 differentiation, employing the concept of deep-learning
                 techniques, for a node to decide whether to release or
                 continue using its acquired time slot. The proposed
                 method is based on the application of a Markov chain
                 model to estimate the channel state when a transmission
                 failure occurs. The Markov model parameters are
                 dynamically updated by each node (i.e., vehicle or
                 roadside unit) based on information included in the
                 safety messages that are periodically received from
                 neighboring nodes. In addition, from the D-TDMA
                 protocol headers of received messages, a node
                 approximately determines the error in estimating the
                 channel state based on the proposed Markov model and
                 then uses this channel estimation error to further
                 improve subsequent channel state estimations. Through
                 mathematical analysis, we show that transmission
                 failure differentiation, or transmission collision
                 detection, helps a node to efficiently reserve a time
                 slot even with a large number of nodes contending for
                 time slots. Furthermore, through extensive simulations
                 in a highway scenario, we demonstrate that the proposed
                 solution significantly improves the performance of
                 D-TDMA protocols by reducing unnecessary contention on
                 the available time slots, thus increasing the number of
                 nodes having unique time slots for successful broadcast
                 of safety messages.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Vandecasteele:2017:SSC,
  author =       "Florian Vandecasteele and Karel Vandenbroucke and
                 Dimitri Schuurman and Steven Verstockt",
  title =        "{Spott}: On-the-Spot e-Commerce for Television Using
                 Deep Learning-Based Video Analysis Techniques",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "38:1--38:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092834",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Spott is an innovative second screen mobile multimedia
                 application which offers viewers relevant information
                 on objects (e.g., clothing, furniture, food) they see
                 and like on their television screens. The application
                 enables interaction between TV audiences and brands, so
                 producers and advertisers can offer potential consumers
                 tailored promotions, e-shop items, and/or free samples.
                 In line with the current views on innovation
                 management, the technological excellence of the Spott
                 application is coupled with iterative user involvement
                 throughout the entire development process. This article
                 discusses both of these aspects and how they impact
                 each other. First, we focus on the technological
                 building blocks that facilitate the (semi-) automatic
                 interactive tagging process of objects in the video
                 streams. The majority of these building blocks
                 extensively make use of novel and state-of-the-art deep
                 learning concepts and methodologies. We show how these
                 deep learning based video analysis techniques
                 facilitate video summarization, semantic keyframe
                 clustering, and (similar) object retrieval. Secondly,
                 we provide insights in user tests that have been
                 performed to evaluate and optimize the application's
                 user experience. The lessons learned from these open
                 field tests have already been an essential input in the
                 technology development and will further shape the
                 future modifications to the Spott application.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2017:TDC,
  author =       "Qingchen Zhang and Laurence T. Yang and Xingang Liu
                 and Zhikui Chen and Peng Li",
  title =        "A {Tucker} Deep Computation Model for Mobile
                 Multimedia Feature Learning",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "39:1--39:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3063593",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Recently, the deep computation model, as a tensor deep
                 learning model, has achieved super performance for
                 multimedia feature learning. However, the conventional
                 deep computation model involves a large number of
                 parameters. Typically, training a deep computation
                 model with millions of parameters needs
                 high-performance servers with large-scale memory and
                 powerful computing units, limiting the growth of the
                 model size for multimedia feature learning on common
                 devices such as portable CPUs and conventional
                 desktops. To tackle this problem, this article proposes
                 a Tucker deep computation model by using the Tucker
                 decomposition to compress the weight tensors in the
                 full-connected layers for multimedia feature learning.
                 Furthermore, a learning algorithm based on the
                 back-propagation strategy is devised to train the
                 parameters of the Tucker deep computation model.
                 Finally, the performance of the Tucker deep computation
                 model is evaluated by comparing with the conventional
                 deep computation model on two representative multimedia
                 datasets, that is, CUAVE and SNAE2, in terms of
                 accuracy drop, parameter reduction, and speedup in the
                 experiments. Results imply that the Tucker deep
                 computation model can achieve a large-parameter
                 reduction and speedup with a small accuracy drop for
                 multimedia feature learning.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Timmerer:2017:BPA,
  author =       "Christian Timmerer and Ali C. Begen",
  title =        "Best Papers of the {2016 ACM Multimedia Systems
                 (MMSys) Conference and Workshop on Network and
                 Operating System Support for Digital Audio and Video
                 (NOSSDAV) 2016}",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "40:1--40:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3084539",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Daronco:2017:DRA,
  author =       "Stefano D'Aronco and Sergio Mena and Pascal Frossard",
  title =        "Distributed Rate Allocation in Switch-Based Multiparty
                 Videoconferencing System",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "41:1--41:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092835",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Multiparty videoconferences, or more generally
                 multiparty video calls, are gaining a lot of popularity
                 as they offer a rich communication experience. These
                 applications have, however, large requirements in terms
                 of both network and computational resources and have to
                 deal with sets of heterogeneous clients. The multiparty
                 videoconferencing systems are usually either based on
                 expensive central nodes, called Multipoint Control
                 Units (MCU), with transcoding capabilities, or on a
                 peer-to-peer architecture where users cooperate to
                 distribute more efficiently the different video
                 streams. Whereas the first class of systems requires an
                 expensive central hardware, the second one depends
                 completely on the redistribution capacity of the users,
                 which sometimes might neither provide sufficient
                 bandwidth nor be reliable enough. In this work, we
                 propose an alternative solution where we use a central
                 node to distribute the video streams, but at the same
                 time we maintain the hardware complexity and the
                 computational requirements of this node as low as
                 possible, for example, it has no video decoding
                 capabilities. We formulate the rate allocation problem
                 as an optimization problem that aims at maximizing the
                 Quality of Service (QoS) of the videoconference. We
                 propose two different distributed algorithms for
                 solving the optimization problem: the first algorithm
                 is able to find an approximate solution of the problem
                 in a one-shot execution, whereas the second algorithm,
                 based on Lagrangian relaxation, performs iterative
                 updates of the optimization variables in order to
                 gradually increase the value of the objective function.
                 The two algorithms, though being disjointed, nicely
                 complement each other. If executed in sequence, they
                 allow us to achieve both a quick approximate rate
                 reallocation, in case of a sudden change of the system
                 conditions, and a precise refinement of the variables,
                 which avoids problems caused by possible faulty
                 approximate solutions. We have further implemented our
                 solution in a network simulator where we show that our
                 rate allocation algorithm is able to properly optimize
                 users' QoS. We also illustrate the benefits of our
                 solution in terms of network usage and overall utility
                 when compared to a baseline heuristic method operating
                 on the same system architecture.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Cofano:2017:DPE,
  author =       "Giuseppe Cofano and Luca {De Cicco} and Thomas Zinner
                 and Anh Nguyen-Ngoc and Phuoc Tran-Gia and Saverio
                 Mascolo",
  title =        "Design and Performance Evaluation of Network-assisted
                 Control Strategies for {HTTP} Adaptive Streaming",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "42:1--42:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092836",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article investigates several network-assisted
                 streaming approaches that rely on active cooperation
                 between video streaming applications and the network.
                 We build a Video Control Plane that enforces Video
                 Quality Fairness among concurrent video flows generated
                 by heterogeneous client devices. For this purpose, a
                 max-min fairness optimization problem is solved at
                 runtime. We compare two approaches to actuate the
                 optimal solution in a Software Defined Networking
                 network: The first one allocates network bandwidth
                 slices to video flows, and the second one guides video
                 players in the video bitrate selection. We assess
                 performance through several QoE-related metrics, such
                 as Video Quality Fairness, video quality, and switching
                 frequency. The impact of client-side adaptation
                 algorithms is also investigated.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wisniewski:2017:OAA,
  author =       "Piotr Wisniewski and Jordi Mongay Batalla and Andrzej
                 Beben and Piotr Krawiec and Andrzej Chydzinski",
  title =        "On Optimizing Adaptive Algorithms Based on Rebuffering
                 Probability",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "43:1--43:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092837",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Traditionally, video adaptive algorithms aim to select
                 the representation that better fits to the current
                 download rate. In recent years, a number of new
                 approaches appeared that take into account the buffer
                 occupancy and the probability of video rebuffering as
                 important indicators of the representation to be
                 selected. We propose an optimization of the existing
                 algorithm based on rebuffering probability and argue
                 that the algorithm should avoid the situations when the
                 client buffer is full and the download is stopped,
                 since these situations decrease the efficiency of the
                 algorithm. Reducing full buffer states does not
                 increase the rebuffering probability thanks to a clever
                 management of the client buffer, which analyses the
                 buffer occupancy and downloads higher bitrate
                 representations only in the case of high buffer
                 occupancy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Kleinrouweler:2017:SAP,
  author =       "Jan Willem Kleinrouweler and Sergio Cabrero and Pablo
                 Cesar",
  title =        "An {SDN} Architecture for Privacy-Friendly
                 Network-Assisted {DASH}",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "44:1--44:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092838",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Dynamic Adaptive Streaming over HTTP (DASH) is the
                 premier technology for Internet video streaming. DASH
                 efficiently uses existing HTTP-based delivery
                 infrastructures implementing adaptive streaming.
                 However, DASH traffic is bursty in nature. This causes
                 performance problems when DASH players share a network
                 connection or in networks with heavy background
                 traffic. The result is unstable and lower quality
                 video. In this article, we present the design and
                 implementation of a so-called DASH Assisting Network
                 Element (DANE). Our system provides target bitrate
                 signaling and dynamic traffic control. These two
                 mechanisms realize proper bandwidth sharing among
                 clients. Our system is privacy friendly and fully
                 supports encrypted video streams. Trying to improve the
                 streaming experience for users who share a network
                 connection, our system increases the video bitrate and
                 reduces the number of quality switches. We show this
                 through evaluations in our Wi-Fi testbed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2017:DAQ,
  author =       "Cong Wang and Divyashri Bhat and Amr Rizk and Michael
                 Zink",
  title =        "Design and Analysis of {QoE}-Aware Quality Adaptation
                 for {DASH}: a Spectrum-Based Approach",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "45:1--45:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092839",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The dynamics of the application-layer-based control
                 loop of dynamic adaptive streaming over HTTP (DASH)
                 make video bitrate selection for DASH a difficult
                 problem. In this work, we provide a DASH quality
                 adaptation algorithm, named SQUAD, that is specifically
                 tailored to provide a high quality of experience (QoE).
                 We review and provide new insights into the challenges
                 for DASH rate estimation. We found that in addition to
                 the ON-OFF behavior of DASH clients, there exists a
                 discrepancy in the timescales that form the basis of
                 the rate estimates across (i) different video segments
                 and (ii) the rate control loops of DASH and
                 Transmission Control Protocol (TCP). With these
                 observations in mind, we design SQUAD aiming to
                 maximize the average quality bitrate while minimizing
                 the quality variations. We test our implementation of
                 SQUAD together with a number of different quality
                 adaptation algorithms under various conditions in the
                 Global Environment for Networking Innovation testbed,
                 as well as, in a series of measurements over the public
                 Internet. Through a measurement study, we show that by
                 sacrificing little to nothing in average quality
                 bitrate, SQUAD can provide significantly better QoE in
                 terms of quality switching and magnitude. In addition,
                 we show that retransmission of higher-quality segments
                 that were originally received in low-quality is
                 feasible and improves the QoE.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2017:CAC,
  author =       "Cong Zhang and Jiangchuan Liu and Haiyang Wang",
  title =        "Cloud-Assisted Crowdsourced Livecast",
  journal =      j-TOMM,
  volume =       "13",
  number =       "3s",
  pages =        "46:1--46:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3095755",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:22 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The past two years have witnessed an explosion of a
                 new generation of livecast services, represented by
                 Twitch.tv, GamingLive, and Dailymotion, to name but a
                 few. With such a livecast service, geo-distributed
                 Internet users can broadcast any event in real-time,
                 for example, game, cooking, drawing, and so on, to
                 viewers of interest. Its crowdsourced nature enables
                 rich interactions among broadcasters and viewers but
                 also introduces great challenges to accommodate their
                 great scales and dynamics. To fulfill the demands from
                 a large number of heterogeneous broadcasters and
                 geo-distributed viewers, expensive server clusters have
                 been deployed to ingest and transcode live streams. Yet
                 our Twitch-based measurement shows that a significant
                 portion of the unpopular and dynamic broadcasters are
                 consuming considerable system resources; in particular,
                 25\% of bandwidth resources and 30\% of computational
                 capacity are used by the broadcasters who do not have
                 any viewers at all. In this article, through the
                 real-world measurement and data analysis, we show that
                 the public cloud has great potentials to address these
                 scalability challenges. We accordingly present the
                 design of Cloud-assisted Crowdsourced Livecast (CACL)
                 and propose a comprehensive set of solutions for
                 broadcaster partitioning. Our trace-driven evaluations
                 show that our CACL design can smartly assign ingesting
                 and transcoding tasks to the elastic cloud virtual
                 machines, providing flexible and cost-effective system
                 deployment.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Dao:2017:TCM,
  author =       "Minh Son Dao",
  title =        "This is the Table of Contents for the most recent
                 online-only supplemental issue {TOMM} 13(3s). {Please}
                 find this supplemental issue in the {ACM Digital
                 Library} and enjoy reading them!",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "47e:1--47e:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3143786",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47e",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2017:SRB,
  author =       "Hong-Bo Zhang and Bineng Zhong and Qing Lei and
                 Ji-Xiang Du and Jialin Peng and Duansheng Chen and Xiao
                 Ke",
  title =        "Sparse Representation-Based Semi-Supervised Regression
                 for People Counting",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "47:1--47:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106156",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Label imbalance and the insufficiency of labeled
                 training samples are major obstacles in most methods
                 for counting people in images or videos. In this work,
                 a sparse representation-based semi-supervised
                 regression method is proposed to count people in images
                 with limited data. The basic idea is to predict the
                 unlabeled training data, select reliable samples to
                 expand the labeled training set, and retrain the
                 regression model. In the algorithm, the initial
                 regression model, which is learned from the labeled
                 training data, is used to predict the number of people
                 in the unlabeled training dataset. Then, the unlabeled
                 training samples are regarded as an over-complete
                 dictionary. Each feature of the labeled training data
                 can be expressed as a sparse linear approximation of
                 the unlabeled data. In turn, the labels of the labeled
                 training data can be estimated based on a sparse
                 reconstruction in feature space. The label confidence
                 in labeling an unlabeled sample is estimated by
                 calculating the reconstruction error. The training set
                 is updated by selecting unlabeled samples with minimal
                 reconstruction errors, and the regression model is
                 retrained on the new training set. A co-training style
                 method is applied during the training process. The
                 experimental results demonstrate that the proposed
                 method has a low mean square error and mean absolute
                 error compared with those of state-of-the-art
                 people-counting benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Akhtar:2017:COV,
  author =       "Shahid Akhtar and Andre Beck and Ivica Rimac",
  title =        "Caching Online Video: Analysis and Proposed
                 Algorithm",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "48:1--48:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106157",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Online video presents new challenges to traditional
                 caching with over a thousand-fold increase in number of
                 assets, rapidly changing popularity of assets and much
                 higher throughput requirements. We propose a new
                 hierarchical filtering algorithm for caching online
                 video HiFi. Our algorithm is designed to optimize hit
                 rate, replacement rate and cache throughput. It has an
                 associated implementation complexity comparable to that
                 of LRU. Our results show that, under typical operator
                 conditions, HiFi can increase edge cache byte hit rate
                 by 5\%--24\% over an LRU policy, but more importantly
                 can increase the RAM or memory byte hit rate by 80\% to
                 200\% and reduce the replacement rate by more than 100
                 times! These two factors combined can dramatically
                 increase throughput for most caches. If SSDs are used
                 for storage, the much lower replacement rate may also
                 allow substitution of lower-cost MLC-based SSDs instead
                 of SLC-based SSDs. We extend previous multi-tier
                 analytical models for LRU caches to caches with
                 filtering. We analytically show how HiFi can approach
                 the performance of an optimal caching policy and how to
                 tune HiFi to reach as close to optimal performance as
                 the traffic conditions allow. We develop a realistic
                 simulation environment for online video using
                 statistics from operator traces. We show that HiFi
                 performs within a few percentage points from the
                 optimal solution which was simulated by Belady's MIN
                 algorithm under typical operator conditions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Dang-Nguyen:2017:MRD,
  author =       "Duc-Tien Dang-Nguyen and Luca Piras and Giorgio
                 Giacinto and Giulia Boato and Francesco G. B. {De
                 Natale}",
  title =        "Multimodal Retrieval with Diversification and
                 Relevance Feedback for Tourist Attraction Images",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "49:1--49:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3103613",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we present a novel framework that can
                 produce a visual description of a tourist attraction by
                 choosing the most diverse pictures from
                 community-contributed datasets, which describe
                 different details of the queried location. The main
                 strength of the proposed approach is its flexibility
                 that permits us to filter out non-relevant images and
                 to obtain a reliable set of diverse and relevant images
                 by first clustering similar images according to their
                 textual descriptions and their visual content and then
                 extracting images from different clusters according to
                 a measure of the user's credibility. Clustering is
                 based on a two-step process, where textual descriptions
                 are used first and the clusters are then refined
                 according to the visual features. The degree of
                 diversification can be further increased by exploiting
                 users' judgments on the results produced by the
                 proposed algorithm through a novel approach, where
                 users not only provide a relevance feedback but also a
                 diversity feedback. Experimental results performed on
                 the MediaEval 2015 ``Retrieving Diverse Social Images''
                 dataset show that the proposed framework can achieve
                 very good performance both in the case of automatic
                 retrieval of diverse images and in the case of the
                 exploitation of the users' feedback. The effectiveness
                 of the proposed approach has been also confirmed by a
                 small case study involving a number of real users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{FujiiPontello:2017:MUR,
  author =       "Luciana {Fujii Pontello} and Pedro H. F. Holanda and
                 Bruno Guilherme and Jo{\~a}o Paulo V. Cardoso and Olga
                 Goussevskaia and Ana Paula {Couto Da Silva}",
  title =        "Mixtape: Using Real-Time User Feedback to Navigate
                 Large Media Collections",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "50:1--50:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3105969",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this work, we explore the increasing demand for
                 novel user interfaces to navigate large media
                 collections. We implement a geometric data structure to
                 store and retrieve item-to-item similarity information
                 and propose a novel navigation framework that uses
                 vector operations and real-time user feedback to direct
                 the outcome. The framework is scalable to large media
                 collections and is suitable for computationally
                 constrained devices. In particular, we implement this
                 framework in the domain of music. To evaluate the
                 effectiveness of the navigation process, we propose an
                 automatic evaluation framework, based on synthetic user
                 profiles, which allows us to quickly simulate and
                 compare navigation paths using different algorithms and
                 datasets. Moreover, we perform a real user study. To do
                 that, we developed and launched Mixtape, a simple web
                 application that allows users to create playlists by
                 providing real-time feedback through liking and
                 skipping patterns.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yakubu:2017:SSN,
  author =       "Abukari M. Yakubu and Namunu C. Maddage and Pradeep K.
                 Atrey",
  title =        "Securing Speech Noise Reduction in Outsourced
                 Environment",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "51:1--51:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3105970",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Cloud data centers (CDCs) are becoming a
                 cost-effective method for processing and storage of
                 multimedia data including images, video, and audio.
                 Since CDCs are physically located in different
                 jurisdictions, and are managed by external parties,
                 data security is a growing concern. Data encryption at
                 CDCs is commonly practiced to improve data security.
                 However, to process the data at CDCs, data must often
                 be decrypted, which raises issues in security. Thus,
                 there is a growing demand for data processing
                 techniques in encrypted domain in such an outsourced
                 environment. In this article, we analyze encrypted
                 domain speech content processing techniques for noise
                 reduction. Noise contaminates speech during
                 transmission or during the acquisition process by
                 recording. As a result, the quality of the speech
                 content is degraded. We apply Shamir's secret sharing
                 as the cryptosystem to encrypt speech data before
                 uploading it to a CDC. We then propose finite impulse
                 response digital filters to reduce white and wind noise
                 in the speech in the encrypted domain. We prove that
                 our proposed schemes meet the security requirements of
                 efficiency, accuracy, and checkability for both
                 semi-honest and malicious adversarial models.
                 Experimental results show that our proposed filtering
                 techniques for speech noise reduction in the encrypted
                 domain produce similar results when compared to
                 plaintext domain processing.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Guerrini:2017:IFR,
  author =       "Fabrizio Guerrini and Nicola Adami and Sergio Benini
                 and Alberto Piacenza and Julie Porteous and Marc
                 Cavazza and Riccardo Leonardi",
  title =        "Interactive Film Recombination",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "52:1--52:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3103241",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, we discuss an innovative media
                 entertainment application called Interactive
                 Movietelling. As an offspring of Interactive
                 Storytelling applied to movies, we propose to integrate
                 narrative generation through artificial intelligence
                 (AI) planning with video processing and modeling to
                 construct filmic variants starting from the baseline
                 content. The integration is possible thanks to content
                 description using semantic attributes pertaining to
                 intermediate-level concepts shared between video
                 processing and planning levels. The output is a
                 recombination of segments taken from the input movie
                 performed so as to convey an alternative plot. User
                 tests on the prototype proved how promising Interactive
                 Movietelling might be, even if it was designed at a
                 proof of concept level. Possible improvements that are
                 suggested here lead to many challenging research
                 issues.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhou:2017:CCB,
  author =       "Mingliang Zhou and Yongfei Zhang and Bo Li and Xupeng
                 Lin",
  title =        "Complexity Correlation-Based {CTU}-Level Rate Control
                 with Direction Selection for {HEVC}",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "53:1--53:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3107616",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Rate control is a crucial consideration in
                 high-efficiency video coding (HEVC). The estimation of
                 model parameters is very important for coding tree unit
                 (CTU)-level rate control, as it will significantly
                 affect bit allocation and thus coding performance.
                 However, the model parameters in the CTU-level rate
                 control sometimes fail because of inadequate
                 consideration of the correlation between model
                 parameters and complexity characteristic. In this
                 study, we establish a novel complexity
                 correlation-based CTU-level rate control for HEVC.
                 First, we formulate the model parameter estimation
                 scheme as a multivariable estimation problem; second,
                 based on the complexity correlation of the neighbouring
                 CTU, an optimal direction is selected in five
                 directions for reference CTU set selection during model
                 parameter estimation to further improve the prediction
                 accuracy of the complexity of the current CTU. Third,
                 to improve their precision, the relationship between
                 the model parameters and the complexity of the
                 reference CTU set in the optimal direction is
                 established by using least square method (LS), and the
                 model parameters are solved via the estimated
                 complexity of the current CTU. Experimental results
                 show that the proposed algorithm can significantly
                 improve the accuracy of the CTU-level rate control and
                 thus the coding performance; the proposed scheme
                 consistently outperforms HM 16.0 and other
                 state-of-the-art algorithms in a variety of testing
                 configurations. More specifically, up to 8.4\% and on
                 average 6.4\% BD-Rate reduction is achieved compared to
                 HM 16.0 and up to 4.7\% and an average of 3.4\% BD-Rate
                 reduction is achieved compared to other algorithms,
                 with only a slight complexity overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Sharrab:2017:MAP,
  author =       "Yousef O. Sharrab and Nabil J. Sarhan",
  title =        "Modeling and Analysis of Power Consumption in Live
                 Video Streaming Systems",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "54:1--54:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3115505",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article develops an aggregate power consumption
                 model for live video streaming systems, including
                 many-to-many systems. In many-to-one streaming systems,
                 multiple video sources (i.e., cameras and/or sensors)
                 stream videos to a monitoring station. We model the
                 power consumed by the video sources in the capturing,
                 encoding, and transmission phases and then provide an
                 overall model in terms of the main capturing and
                 encoding parameters, including resolution, frame rate,
                 number of reference frames, motion estimation range,
                 and quantization. We also analyze the power consumed by
                 the monitoring station due to receiving, decoding, and
                 upscaling the received video streams. In addition to
                 modeling the power consumption, we model the achieved
                 bitrate of video encoding. We validate the developed
                 models through extensive experiments using two types of
                 systems and different video contents. Furthermore, we
                 analyze many-to-one systems in terms of bitrate, video
                 quality, and the power consumed by the sources, as well
                 as that by the monitoring station, considering the
                 impacts of multiple parameters simultaneously.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Ng:2017:WSD,
  author =       "Pai Chet Ng and James She and Kang Eun Jeon and
                 Matthias Baldauf",
  title =        "When Smart Devices Interact With Pervasive Screens: a
                 Survey",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "55:1--55:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3115933",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The meeting of pervasive screens and smart devices has
                 witnessed the birth of screen-smart device interaction
                 (SSI), a key enabler to many novel interactive use
                 cases. Most current surveys focus on direct
                 human-screen interaction, and to the best of our
                 knowledge, none have studied state-of-the-art SSI. This
                 survey identifies three core elements of SSI and
                 delivers a timely discussion on SSI oriented around the
                 screen, the smart device, and the interaction modality.
                 Two evaluation metrics (i.e., interaction latency and
                 accuracy) have been adopted and refined to match the
                 evaluation criterion of SSI. The bottlenecks that
                 hinder the further advancement of the current SSI in
                 connection with these metrics are studied. Last, future
                 research challenges and opportunities are highlighted
                 in the hope of inspiring continuous research efforts to
                 realize the next generation of SSI.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Franti:2017:MMO,
  author =       {Pasi Fr{\"a}nti and Radu Mariescu-Istodor and
                 Lahari Sengupta},
  title =        {{O-Mopsi}: Mobile Orienteering Game for Sightseeing, Exercising,
                 and Education},
  journal =      j-TOMM,
  volume =       {13},
  number =       {4},
  pages =        {56:1--56:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3115935},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Sat Dec 23 10:49:23 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Location-based games have been around already since 2000 but
                 only recently when PokemonGo came to markets it became clear
                 that they can reach wide popularity. In this article, we
                 perform a literature-based analytical study of what kind of
                 issues location-based game design faces, and how they can be
                 solved. We study how to use and verify the location, the role
                 of the games as exergames, use in education, and study
                 technical and safety issues. As a case study, we present
                 O-Mopsi game that combines physical activity with problem
                 solving. It includes three challenges: (1) navigating to the
                 next target, (2) deciding the order of targets, (3) physical
                 movement. All of them are unavoidable and relevant. For
                 guiding the players, we use three types of multimedia: images
                 (targets and maps), sound (user guidance), and GPS (for
                 positioning). We discuss motivational aspects, analysis of
                 the playing, and content creation. The quality of experiences
                 is reported based on playing in SciFest Science festivals
                 during 2011--2016.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {56},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Messaoudi:2017:PAG,
  author =       {Farouk Messaoudi and Adlen Ksentini and
                 Gwendal Simon and Philippe Bertin},
  title =        {Performance Analysis of Game Engines on Mobile and Fixed
                 Devices},
  journal =      j-TOMM,
  volume =       {13},
  number =       {4},
  pages =        {57:1--57:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3115934},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Sat Dec 23 10:49:23 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Mobile gaming is an emerging concept wherein gamers are using
                 mobile devices, like smartphones and tablets, to play
                 best-seller games. Compared to dedicated gaming boxes or PCs,
                 these devices still fall short of executing newly complex 3D
                 video games with a rich immersion. Three novel solutions,
                 relying on cloud computing infrastructure, namely, computation
                 offloading, cloud gaming, and client-server architecture, will
                 represent the next generation of game engine architecture
                 aiming at improving the gaming experience. The basis of these
                 aforementioned solutions is the distribution of the game code
                 over different devices (including set-top boxes, PCs, and
                 servers). In order to know how the game code should be
                 distributed, advanced knowledge of game engines is required.
                 By consequence, dissecting and analyzing game engine
                 performances will surely help to better understand how to move
                 in these new directions (i.e., distribute game code), which is
                 so far missing in the literature. Aiming at filling this gap,
                 we propose in this article to analyze and evaluate one of the
                 famous engines in the market, that is, ``Unity 3D.'' We begin
                 by detailing the architecture and the game logic of game
                 engines. Then, we propose a test-bed to evaluate the CPU and
                 GPU consumption per frame and per module for nine
                 representative games on three platforms, namely, a stand-alone
                 computer, embedded systems, and web players. Based on the
                 obtained results and observations, we build a valued graph of
                 each module, composing the Unity 3D architecture, which
                 reflects the internal flow and CPU consumption. Finally, we
                 made a comparison in terms of CPU consumption between these
                 architectures.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {57},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Cheung:2017:ECF,
  author =       {Ming Cheung and Xiaopeng Li and James She},
  title =        {An Efficient Computation Framework for Connection Discovery
                 using Shared Images},
  journal =      j-TOMM,
  volume =       {13},
  number =       {4},
  pages =        {58:1--58:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3115951},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Sat Dec 23 10:49:23 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {With the advent and popularity of the social network, social
                 graphs become essential to improve services and information
                 relevance to users for many social media applications to
                 predict follower/followee relationship, community membership,
                 and so on. However, the social graphs could be hidden by users
                 due to privacy concerns or kept by social media. Recently,
                 connections discovered from user-shared images using
                 machine-generated labels are proved to be more accessible
                 alternatives to social graphs. But real-time discovery is
                 difficult due to high complexity, and many applications are
                 not possible. This article proposes an efficient computation
                 framework for connection discovery using user-shared images,
                 which is suitable for any image processing and computer vision
                 techniques for connection discovery on the fly. The framework
                 includes the architecture of online computation to facilitate
                 real-time processing, offline computation for a complete
                 processing, and online/offline communication. The proposed
                 framework is implemented to demonstrate its effectiveness by
                 speeding up connection discovery through user-shared images.
                 By studying 300K+ user-shared images from two popular social
                 networks, it is proven that the proposed computation framework
                 reduces 90\% of runtime with a comparable accurate with
                 existing frameworks.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {58},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Li:2017:DSF,
  author =       {Xiaopeng Li and Ming Cheung and James She},
  title =        {A Distributed Streaming Framework for Connection Discovery
                 Using Shared Videos},
  journal =      j-TOMM,
  volume =       {13},
  number =       {4},
  pages =        {59:1--59:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3120996},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Sat Dec 23 10:49:23 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {With the advances in mobile devices and the popularity of
                 social networks, users can share multimedia content anytime,
                 anywhere. One of the most important types of emerging content
                 is video, which is commonly shared on platforms such as
                 Instagram and Facebook. User connections, which indicate
                 whether two users are follower/followee or have the same
                 interests, are essential to improve services and information
                 relevant to users for many social media applications. But they
                 are normally hidden due to users' privacy concerns or are kept
                 confidential by social media sites. Using user-shared content
                 is an alternative way to discover user connections. This
                 article proposes to use user-shared videos for connection
                 discovery with the Bag of Feature Tagging method and proposes
                 a distributed streaming computation framework to facilitate
                 the analytics. Exploiting the uniqueness of shared videos, the
                 proposed framework is divided into Streaming processing and
                 Online and Offline Computation. With experiments using a
                 dataset from Twitter, it has been proved that the proposed
                 method using user-shared videos for connection discovery is
                 feasible. And the proposed computation framework
                 significantly accelerates the analytics, reducing the
                 processing time to only 32\% for follower/followee
                 recommendation. It has also been proved that comparable
                 performance can be achieved with only partial data for each
                 video and leads to more efficient computation.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {59},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{DeBoer:2017:SRZ,
  author =       "Maaike H. T. {De Boer} and Yi-Jie Lu and Hao Zhang and
                 Klamer Schutte and Chong-Wah Ngo and Wessel Kraaij",
  title =        "Semantic Reasoning in Zero Example Video Event
                 Retrieval",
  journal =      j-TOMM,
  volume =       "13",
  number =       "4",
  pages =        "60:1--60:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3131288",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Dec 23 10:49:23 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Searching in digital video data for high-level events,
                 such as a parade or a car accident, is challenging when
                 the query is textual and lacks visual example images or
                 videos. Current research in deep neural networks is
                 highly beneficial for the retrieval of high-level
                 events using visual examples, but without examples it
                 is still hard to (1) determine which concepts are
                 useful to pre-train (Vocabulary challenge) and (2)
                 which pre-trained concept detectors are relevant for a
                 certain unseen high-level event (Concept Selection
                 challenge). In our article, we present our Semantic
                 Event Retrieval System which (1) shows the importance
                 of high-level concepts in a vocabulary for the
                 retrieval of complex and generic high-level events and
                 (2) uses a novel concept selection method (i-w2v)
                 based on semantic embeddings. Our experiments on the
                 international TRECVID Multimedia Event Detection
                 benchmark show that a diverse vocabulary including
                 high-level concepts improves performance on the
                 retrieval of high-level events in videos and that our
                 novel method outperforms a knowledge-based concept
                 selection method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Guo:2017:EMD,
  author =       {Jianting Guo and Peijia Zheng and Jiwu Huang},
  title =        {An Efficient Motion Detection and Tracking Scheme
                 for Encrypted Surveillance Videos},
  journal =      j-TOMM,
  volume =       {13},
  number =       {4},
  pages =        {61:1--61:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3131342},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Sat Dec 23 10:49:23 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Performing detection on surveillance videos contributes
                 significantly to the goals of safety and security. However,
                 performing detection on unprotected surveillance video may
                 reveal the privacy of innocent people in the video. Therefore,
                 striking a proper balance between maintaining personal privacy
                 while enhancing the feasibility of detection is an important
                 issue. One promising solution to this problem is to encrypt
                 the surveillance videos and perform detection on the encrypted
                 videos. Most existing encrypted signal processing methods
                 focus on still images or small data volumes; however, because
                 videos are typically much larger, investigating how to process
                 encrypted videos is a significant challenge. In this article,
                 we propose an efficient motion detection and tracking scheme
                 for encrypted H.264/AVC video bitstreams, which does not
                 require the previous decryption on the encrypted video. The
                 main idea is to first estimate motion information from the
                 bitstream structure and codeword length and, then, propose a
                 region update (RU) algorithm to deal with the loss and error
                 drifting of motion caused by the video encryption. The RU
                 algorithm is designed based on the prior knowledge that the
                 object motion in the video is continuous in space and time.
                 Compared to the existing scheme, which is based on video
                 encryption that occurs at the pixel level, the proposed scheme
                 has the advantages of requiring only a small storage of the
                 encrypted video and has a low computational cost for both
                 encryption and detection. Experimental results show that our
                 scheme performs better regarding detection accuracy and
                 execution speed. Moreover, the proposed scheme can work with
                 more than one format-compliant video encryption method,
                 provided that the positions of the macroblocks can be
                 extracted from the encrypted video bitstream. Due to the
                 coupling of video stream encryption and detection algorithms,
                 our scheme can be directly connected to the video stream
                 output (e.g., surveillance cameras) without requiring any
                 camera modifications.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {61},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Motamedi:2017:PPF,
  author =       {Mohammad Motamedi and Philipp Gysel and Soheil Ghiasi},
  title =        {{PLACID}: a Platform for {FPGA}-Based Accelerator Creation
                 for {DCNNs}},
  journal =      j-TOMM,
  volume =       {13},
  number =       {4},
  pages =        {62:1--62:??},
  month =        oct,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3131289},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Sat Dec 23 10:49:23 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Deep Convolutional Neural Networks (DCNNs) exhibit remarkable
                 performance in a number of pattern recognition and
                 classification tasks. Modern DCNNs involve many millions of
                 parameters and billions of operations. Inference using such
                 DCNNs, if implemented as software running on an embedded
                 processor, results in considerable execution time and energy
                 consumption, which is prohibitive in many mobile applications.
                 Field-programmable gate array (FPGA)-based acceleration of
                 DCNN inference is a promising approach to improve both energy
                 consumption and classification throughput. However, the
                 engineering effort required for development and verification
                 of an optimized FPGA-based architecture is significant. In
                 this article, we present PLACID, an automated PLatform for
                 Accelerator CreatIon for DCNNs. PLACID uses an analytical
                 approach to characterization and exploration of the
                 implementation space. PLACID enables generation of an
                 accelerator with the highest throughput for a given DCNN on a
                 specific target FPGA platform. Subsequently, it generates an
                 RTL level architecture in Verilog, which can be passed onto
                 commercial tools for FPGA implementation. PLACID is fully
                 automated, and reduces the accelerator design time from a few
                 months down to a few hours. Experimental results show that
                 architectures synthesized by PLACID yield 2$ \times $ higher
                 throughput density than the best competing approach.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {62},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Akputu:2018:ERU,
  author =       {Oryina Kingsley Akputu and Kah Phooi Seng and
                 Yunli Lee and Li-Minn Ang},
  title =        {Emotion Recognition Using Multiple Kernel Learning toward
                 E-learning Applications},
  journal =      j-TOMM,
  volume =       {14},
  number =       {1},
  pages =        {1:1--1:??},
  month =        jan,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3131287},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Tue Jan 16 18:18:12 MST 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Adaptive Educational Hypermedia (AEH) e-learning models aim
                 to personalize educational content and learning resources
                 based on the needs of an individual learner. The Adaptive
                 Hypermedia Architecture (AHA) is a specific implementation of
                 the AEH model that exploits the cognitive characteristics of
                 learner feedback to adapt resources accordingly. However,
                 beside cognitive feedback, the learning realm generally
                 includes both the affective and emotional feedback of the
                 learner, which is often neglected in the design of e-learning
                 models. This article aims to explore the potential of
                 utilizing affect or emotion recognition research in AEH
                 models. The framework is referred to as Multiple Kernel
                 Learning Decision Tree Weighted Kernel Alignment (MKLDT-WFA).
                 The MKLDT-WFA has two merits over classical MKL. First, the
                 WFA component only preserves the relevant kernel weights to
                 reduce redundancy and improve the discrimination for emotion
                 classes. Second, training via the decision tree reduces the
                 misclassification issues associated with the SimpleMKL. The
                 proposed work has been evaluated on different emotion datasets
                 and the results confirm the good performances. Finally, the
                 conceptual Emotion-based E-learning Model (EEM) with the
                 proposed emotion recognition framework is proposed for future
                 work.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {1},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Li:2018:LLP,
  author =       {Kai Li and Guo-Jun Qi and Kien A. Hua},
  title =        {Learning Label Preserving Binary Codes for
                 Multimedia Retrieval: a General Approach},
  journal =      j-TOMM,
  volume =       {14},
  number =       {1},
  pages =        {2:1--2:??},
  month =        jan,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3152126},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Tue Jan 16 18:18:12 MST 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Learning-based hashing has been researched extensively in the
                 past few years due to its great potential in fast and accurate
                 similarity search among huge volumes of multimedia data. In
                 this article, we present a novel multimedia hashing framework,
                 called Label Preserving Multimedia Hashing (LPMH) for
                 multimedia similarity search. In LPMH, a general optimization
                 method is used to learn the joint binary codes of multiple
                 media types by explicitly preserving semantic label
                 information. Compared with existing hashing methods which are
                 typically developed under and thus restricted to some specific
                 objective functions, the proposed optimization strategy is not
                 tied to any specific loss function and can easily incorporate
                 bit balance constraints to produce well-balanced binary codes.
                 Specifically, our formulation leads to a set of Binary Integer
                 Programming (BIP) problems that have exact solutions both with
                 and without bit balance constraints. These problems can be
                 solved extremely fast and the solution can easily scale up to
                 large-scale datasets. In the hash function learning stage, the
                 boosted decision trees algorithm is utilized to learn multiple
                 media-specific hash functions that can map heterogeneous data
                 sources into a homogeneous Hamming space for cross-media
                 retrieval. We have comprehensively evaluated the proposed
                 method using a range of large-scale datasets in both
                 single-media and cross-media retrieval tasks. The experimental
                 results demonstrate that LPMH is competitive with
                 state-of-the-art methods in both speed and accuracy.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {2},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ceballos:2018:IEC,
  author =       {Rodrigo Ceballos and Beatrice Ionascu and
                 Wanjoo Park and Mohamad Eid},
  title =        {Implicit Emotion Communication: {EEG} Classification and
                 Haptic Feedback},
  journal =      j-TOMM,
  volume =       {14},
  number =       {1},
  pages =        {3:1--3:??},
  month =        jan,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3152128},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Tue Jan 16 18:18:12 MST 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Today, ubiquitous digital communication systems do not have
                 an intuitive, natural way of communicating emotion, which, in
                 turn, affects the degree to which humans can emotionally
                 connect and interact with one another. To address this
                 problem, a more natural, intuitive, and implicit emotion
                 communication system was designed and created that employs
                 asymmetry-based EEG emotion classification for detecting the
                 emotional state of the sender and haptic feedback (in the form
                 of tactile gestures) for displaying emotions for a receiver.
                 Emotions are modeled in terms of valence (positive/negative
                 emotions) and arousal (intensity of the emotion). Performance
                 analysis shows that the proposed EEG subject-dependent emotion
                 classification model with Free Asymmetry features allows for
                 more flexible feature-generation schemes than other existing
                 algorithms and attains an average accuracy of 92.5\% for
                 valence and 96.5\% for arousal, outperforming
                 previous-generation schemes in high feature space. As for the
                 haptic feedback, a tactile gesture authoring tool and a haptic
                 jacket were developed to design tactile gestures that can
                 intensify emotional reactions in terms of valence and arousal.
                 Experimental study demonstrated that subject-independent
                 emotion transmission through tactile gestures is effective for
                 the arousal dimension of an emotion but is less effective for
                 valence. Consistency in subject-dependent responses for both
                 valence and arousal suggests that personalized tactile
                 gestures would be more effective.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {3},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Wu:2018:DAQ,
  author =       {Jiyan Wu and Bo Cheng and Yuan Yang and
                 Ming Wang and Junliang Chen},
  title =        {Delay-Aware Quality Optimization in Cloud-Assisted Video
                 Streaming System},
  journal =      j-TOMM,
  volume =       {14},
  number =       {1},
  pages =        {4:1--4:??},
  month =        jan,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3152116},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Tue Jan 16 18:18:12 MST 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract =     {Cloud-assisted video streaming has emerged as a new paradigm
                 to optimize multimedia content distribution over the Internet.
                 This article investigates the problem of streaming
                 cloud-assisted real-time video to multiple destinations (e.g.,
                 cloud video conferencing, multi-player cloud gaming, etc.)
                 over lossy communication networks. The user diversity and
                 network dynamics result in the delay differences among
                 multiple destinations. This research proposes Differentiated
                 cloud-Assisted VIdeo Streaming (DAVIS) framework, which
                 proactively leverages such delay differences in video coding
                 and transmission optimization. First, we analytically
                 formulate the optimization problem of joint coding and
                 transmission to maximize received video quality. Second, we
                 develop a quality optimization framework that integrates the
                 video representation selection and FEC (Forward Error
                 Correction) packet interleaving. The proposed DAVIS is able to
                 effectively perform differentiated quality optimization for
                 multiple destinations by taking advantage of the delay
                 differences in cloud-assisted video streaming system. We
                 conduct the performance evaluation through extensive
                 experiments with the Amazon EC2 instances and Exata emulation
                 platform. Evaluation results show that DAVIS outperforms the
                 reference cloud-assisted streaming solutions in video quality
                 and delay performance.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {4},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Jiang:2018:DBC,
  author          = {Shuhui Jiang and Yue Wu and Yun Fu},
  title           = {Deep Bidirectional Cross-Triplet Embedding for Online
                    Clothing Shopping},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {5:1--5:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152114},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this article, we address the cross-domain (i.e.,
                    street and shop) clothing retrieval problem and
                    investigate its real-world applications for online
                    clothing shopping. It is a challenging problem due to
                    the large discrepancy between street and shop domain
                    images. We focus on learning an effective
                    feature-embedding model to generate robust and
                    discriminative feature representation across domains.
                    Existing triplet embedding models achieve promising
                    results by finding an embedding metric in which the
                    distance between negative pairs is larger than the
                    distance between positive pairs plus a margin. However,
                    existing methods do not address the challenges in the
                    cross-domain clothing retrieval scenario sufficiently.
                    First, the intradomain and cross-domain data
                    relationships need to be considered simultaneously.
                    Second, the number of matched and nonmatched
                    cross-domain pairs are unbalanced. To address these
                    challenges, we propose a deep cross-triplet embedding
                    algorithm together with a cross-triplet sampling
                    strategy. The extensive experimental evaluations
                    demonstrate the effectiveness of the proposed
                    algorithms well. Furthermore, we investigate two novel
                    online shopping applications, clothing trying on and
                    accessories recommendation, based on a unified
                    cross-domain clothing retrieval framework.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Wang:2018:DFI,
  author          = {Peisong Wang and Qinghao Hu and Zhiwei Fang and
                    Chaoyang Zhao and Jian Cheng},
  title           = {{DeepSearch}: a Fast Image Search Framework for Mobile
                    Devices},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {6:1--6:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152127},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Content-based image retrieval (CBIR) is one of the
                    most important applications of computer vision. In
                    recent years, there have been many important advances
                    in the development of CBIR systems, especially
                    Convolutional Neural Networks (CNNs) and other
                    deep-learning techniques. On the other hand, current
                    CNN-based CBIR systems suffer from high computational
                    complexity of CNNs. This problem becomes more severe as
                    mobile applications become more and more popular. The
                    current practice is to deploy the entire CBIR systems
                    on the server side while the client side only serves as
                    an image provider. This architecture can increase the
                    computational burden on the server side, which needs to
                    process thousands of requests per second. Moreover,
                    sending images have the potential of personal
                    information leakage. As the need of mobile search
                    expands, concerns about privacy are growing. In this
                    article, we propose a fast image search framework,
                    named DeepSearch, which makes complex image search
                    based on CNNs feasible on mobile phones. To implement
                    the huge computation of CNN models, we present a tensor
                    Block Term Decomposition (BTD) approach as well as a
                    nonlinear response reconstruction method to accelerate
                    the CNNs involving in object detection and feature
                    extraction. The extensive experiments on the ImageNet
                    dataset and Alibaba Large-scale Image Search Challenge
                    dataset show that the proposed accelerating approach
                    BTD can significantly speed up the CNN models and
                    further makes CNN-based image search practical on
                    common smart phones.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Liu:2018:RMV,
  author          = {Sicong Liu and Silvestro Roberto Poccia and K.
                    Sel{\c{c}}uk Candan and Maria Luisa Sapino and Xiaolan
                    Wang},
  title           = {Robust Multi-Variate Temporal Features of
                    Multi-Variate Time Series},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {7:1--7:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152123},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Many applications generate and/or consume
                    multi-variate temporal data, and experts often lack the
                    means to adequately and systematically search for and
                    interpret multi-variate observations. In this article,
                    we first observe that multi-variate time series often
                    carry localized multi-variate temporal features that
                    are robust against noise. We then argue that these
                    multi-variate temporal features can be extracted by
                    simultaneously considering, at multiple scales,
                    temporal characteristics of the time series along with
                    external knowledge, including variate relationships
                    that are known a priori. Relying on these observations,
                    we develop data models and algorithms to detect robust
                    multi-variate temporal (RMT) features that can be
                    indexed for efficient and accurate retrieval and can be
                    used for supporting data exploration and analysis
                    tasks. Experiments confirm that the proposed RMT
                    algorithm is highly effective and efficient in
                    identifying robust multi-scale temporal features of
                    multi-variate time series.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Guo:2018:OEL,
  author          = {Dan Guo and Wengang Zhou and Houqiang Li and Meng
                    Wang},
  title           = {Online Early-Late Fusion Based on Adaptive {HMM} for
                    Sign Language Recognition},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {8:1--8:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152121},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In sign language recognition (SLR) with multimodal
                    data, a sign word can be represented by multiply
                    features, for which there exist an intrinsic property
                    and a mutually complementary relationship among them.
                    To fully explore those relationships, we propose an
                    online early-late fusion method based on the adaptive
                    Hidden Markov Model (HMM). In terms of the intrinsic
                    property, we discover that inherent latent change
                    states of each sign are related not only to the number
                    of key gestures and body poses but also to their
                    translation relationships. We propose an adaptive HMM
                    method to obtain the hidden state number of each sign
                    by affinity propagation clustering. For the
                    complementary relationship, we propose an online
                    early-late fusion scheme. The early fusion (feature
                    fusion) is dedicated to preserving useful information
                    to achieve a better complementary score, while the late
                    fusion (score fusion) uncovers the significance of
                    those features and aggregates them in a weighting
                    manner. Different from classical fusion methods, the
                    fusion is query adaptive. For different queries, after
                    feature selection (including the combined feature), the
                    fusion weight is inversely proportional to the area
                    under the curve of the normalized query score list for
                    each selected feature. The whole fusion process is
                    effective and efficient. Experiments verify the
                    effectiveness on the signer-independent SLR with large
                    vocabulary. Compared either on different dataset sizes
                    or to different SLR models, our method demonstrates
                    consistent and promising performance.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Yang:2018:JEA,
  author          = {Huei-Fang Yang and Bo-Yao Lin and Kuang-Yu Chang and
                    Chu-Song Chen},
  title           = {Joint Estimation of Age and Expression by Combining
                    Scattering and Convolutional Networks},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {9:1--9:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152118},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {This article tackles the problem of joint estimation
                    of human age and facial expression. This is an
                    important yet challenging problem because expressions
                    can alter face appearances in a similar manner to human
                    aging. Different from previous approaches that deal
                    with the two tasks independently, our approach trains a
                    convolutional neural network (CNN) model that unifies
                    ordinal regression and multi-class classification in a
                    single framework. We demonstrate experimentally that
                    our method performs more favorably against
                    state-of-the-art approaches.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Huang:2018:EHD,
  author          = {Shao Huang and Weiqiang Wang and Shengfeng He and
                    Rynson W. H. Lau},
  title           = {Egocentric Hand Detection Via Dynamic Region Growing},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {10:1--10:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152129},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Egocentric videos, which mainly record the activities
                    carried out by the users of wearable cameras, have
                    drawn much research attention in recent years. Due to
                    its lengthy content, a large number of ego-related
                    applications have been developed to abstract the
                    captured videos. As the users are accustomed to
                    interacting with the target objects using their own
                    hands, while their hands usually appear within their
                    visual fields during the interaction, an egocentric
                    hand detection step is involved in tasks like gesture
                    recognition, action recognition, and social interaction
                    understanding. In this work, we propose a dynamic
                    region-growing approach for hand region detection in
                    egocentric videos, by jointly considering hand-related
                    motion and egocentric cues. We first determine seed
                    regions that most likely belong to the hand, by
                    analyzing the motion patterns across successive frames.
                    The hand regions can then be located by extending from
                    the seed regions, according to the scores computed for
                    the adjacent superpixels. These scores are derived from
                    four egocentric cues: contrast, location, position
                    consistency, and appearance continuity. We discuss how
                    to apply the proposed method in real-life scenarios,
                    where multiple hands irregularly appear and disappear
                    from the videos. Experimental results on public
                    datasets show that the proposed method achieves
                    superior performance compared with the state-of-the-art
                    methods, especially in complicated scenarios.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Wen:2018:VBR,
  author          = {Jiqing Wen and James She and Xiaopeng Li and Hui Mao},
  title           = {Visual Background Recommendation for Dance
                    Performances Using Deep Matrix Factorization},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {11:1--11:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3152463},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The stage background is one of the most important
                    features for a dance performance, as it helps to create
                    the scene and atmosphere. In conventional dance
                    performances, the background images are usually
                    selected or designed by professional stage designers
                    according to the theme and the style of the dance. In
                    new media dance performances, the stage effects are
                    usually generated by media editing software. Selecting
                    or producing a dance background is quite challenging
                    and is generally carried out by skilled technicians.
                    The goal of the research reported in this article is to
                    ease this process. Instead of searching for background
                    images from the sea of available resources, dancers are
                    recommended images that they are more likely to use.
                    This work proposes the idea of a novel system to
                    recommend images based on content-based social
                    computing. The core part of the system is a
                    probabilistic prediction model to predict a dancer's
                    interests in candidate images through social platforms.
                    Different from traditional collaborative filtering or
                    content-based models, the model proposed here
                    effectively combines a dancer's social behaviors
                    (rating action, click action, etc.) with the visual
                    content of images shared by the dancer using deep
                    matrix factorization (DMF). With the help of such a
                    system, dancers can select from the recommended images
                    and set them as the backgrounds of their dance
                    performances through a media editor. According to the
                    experiment results, the proposed DMF model outperforms
                    the previous methods, and when the dataset is very
                    sparse, the proposed DMF model shows more significant
                    results.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Pan:2018:AFP,
  author          = {Zhaoqing Pan and Jianjun Lei and Yajuan Zhang and Fu
                    Lee Wang},
  title           = {Adaptive Fractional-Pixel Motion Estimation Skipped
                    Algorithm for Efficient {HEVC} Motion Estimation},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {12:1--12:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3159170},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {High-Efficiency Video Coding (HEVC) efficiently
                    addresses the storage and transmit problems of
                    high-definition videos, especially for 4K videos. The
                    variable-size Prediction Units (PUs)--based Motion
                    Estimation (ME) contributes a significant compression
                    rate to the HEVC encoder and also generates a huge
                    computation load. Meanwhile, high-level encoding
                    complexity prevents widespread adoption of the HEVC
                    encoder in multimedia systems. In this article, an
                    adaptive fractional-pixel ME skipped scheme is proposed
                    for low-complexity HEVC ME. First, based on the
                    property of the variable-size PUs--based ME process and
                    the video content partition relationship among
                    variable-size PUs, all inter-PU modes during a coding
                    unit encoding process are classified into root-type PU
                    mode and children-type PU modes. Then, according to the
                    ME result of the root-type PU mode, the
                    fractional-pixel ME of its children-type PU modes is
                    adaptively skipped. Simulation results show that,
                    compared to the original ME in HEVC reference software,
                    the proposed algorithm reduces ME encoding time by an
                    average of 63.22\% while encoding efficiency
                    performance is maintained.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Zheng:2018:DLC,
  author          = {Zhedong Zheng and Liang Zheng and Yi Yang},
  title           = {A Discriminatively Learned {CNN} Embedding for Person
                    Reidentification},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {13:1--13:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3159171},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this article, we revisit two popular convolutional
                    neural networks in person re-identification (re-ID):
                    verification and identification models. The two models
                    have their respective advantages and limitations due to
                    different loss functions. Here, we shed light on how to
                    combine the two models to learn more discriminative
                    pedestrian descriptors. Specifically, we propose a
                    Siamese network that simultaneously computes the
                    identification loss and verification loss. Given a pair
                    of training images, the network predicts the identities
                    of the two input images and whether they belong to the
                    same identity. Our network learns a discriminative
                    embedding and a similarity measurement at the same
                    time, thus taking full usage of the re-ID annotations.
                    Our method can be easily applied on different
                    pretrained networks. Albeit simple, the learned
                    embedding improves the state-of-the-art performance on
                    two public person re-ID benchmarks. Further, we show
                    that our architecture can also be applied to image
                    retrieval. The code is available at
                    \url{https://github.com/layumi/2016_person_re-ID}.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Sun:2018:RPP,
  author          = {Weiwei Sun and Jiantao Zhou and Shuyuan Zhu and Yuan
                    Yan Tang},
  title           = {Robust Privacy-Preserving Image Sharing over Online
                    Social Networks {(OSNs)}},
  journal         = j-TOMM,
  volume          = {14},
  number          = {1},
  pages           = {14:1--14:??},
  month           = jan,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3165265},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jan 16 18:18:12 MST 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                    https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Sharing images online has become extremely easy and
                    popular due to the ever-increasing adoption of mobile
                    devices and online social networks (OSNs). The privacy
                    issues arising from image sharing over OSNs have
                    received significant attention in recent years. In this
                    article, we consider the problem of designing a secure,
                    robust, high-fidelity, storage-efficient image-sharing
                    scheme over Facebook, a representative OSN that is
                    widely accessed. To accomplish this goal, we first
                    conduct an in-depth investigation on the manipulations
                    that Facebook performs to the uploaded images. Assisted
                    by such knowledge, we propose a DCT-domain image
                    encryption/decryption framework that is robust against
                    these lossy operations. As verified theoretically and
                    experimentally, superior performance in terms of data
                    privacy, quality of the reconstructed images, and
                    storage cost can be achieved.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Berretti:2018:IAS,
  author =       "Weiqi Luo and Haodong Li and Qi Yan and Rui Yang and
                 Jiwu Huang",
  title =        "Improved Audio Steganalytic Feature and Its
                 Applications in Audio Forensics",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2",
  pages =        "43:1--43:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3190575",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Digital multimedia steganalysis has attracted wide
                 attention over the past decade. Currently, there are
                 many algorithms for detecting image steganography.
                 However, little research has been devoted to audio
                 steganalysis. Since the statistical properties of image
                 and audio files are quite different, features that are
                 effective in image steganalysis may not be effective
                 for audio. In this article, we design an improved audio
                 steganalytic feature set derived from both the time and
                 Mel-frequency domains for detecting some typical
                 steganography in the time domain, including LSB
                 matching, Hide4PGP, and Steghide. The experiment
                 results, evaluated on different audio sources,
                 including various music and speech clips of different
                 complexity, have shown that the proposed features
                 significantly outperform the existing ones. Moreover,
                 we use the proposed features to detect and further
                 identify some typical audio operations that would
                 probably be used in audio tampering. The extensive
                 experiment results have shown that the proposed
                 features also outperform the related forensic methods,
                 especially when the length of the audio clip is small,
                 such as audio clips with 800 samples. This is very
                 important in real forensic situations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
  remark =       "Earlier versions of this entry listed Stefano Berretti
                 as the sole author; the citation label is unchanged so
                 that existing citations continue to resolve.",
}

@Article{Gupta:2018:AGM,
  author          = {Abhinav Gupta and Divya Singhal},
  title           = {Analytical Global Median Filtering Forensics Based on
                    Moment Histograms},
  journal         = j-TOMM,
  volume          = {14},
  number          = {2},
  pages           = {44:1--44:??},
  month           = may,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3176650},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue May 29 08:39:06 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Median filtering forensics in images has gained wide
                    attention from researchers in recent years because of
                    its inherent nature of preserving visual traces.
                    Although many forensic methods are developed for median
                    filtering detection, probability of detection reduces
                    under JPEG compression at low-quality factors and for
                    low-resolution images. The feature set reduction is
                    also a challenging issue among existing detectors. In
                    this article, a 19-dimensional feature set is
                    analytically derived from image skewness and kurtosis
                    histograms. This new feature set is exploited for the
                    purpose of global median filtering forensics and
                    verified with exhaustive experimental results. The
                    efficacy of the method is tested on six popular
                    databases (UCID, BOWS2, BOSSBase, NRCS, RAISE, and DID)
                    and found that the new feature set uncovers filtering
                    traces for moderate, low JPEG post-compression and
                    low-resolution operation. Our proposed method yields
                    lowest probability of error and largest area under the
                    ROC curve for most of the test cases in comparison with
                    previous approaches. Some novel test cases are
                    introduced to thoroughly assess the benefits and
                    limitations of the proposed method. The obtained
                    results indicate that the proposed method would provide
                    an important tool to the field of passive image
                    forensics.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Huang:2018:MSH,
  author = {Min Huang and Song-Zhi Su and Hong-Bo Zhang and Guo-Rong Cai
    and Dongying Gong and Donglin Cao and Shao-Zi Li},
  title = {Multifeature Selection for {$3$D} Human Action Recognition},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {45:1--45:??},
  articleno = {45},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3177757},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {In mainstream approaches for 3D human action
    recognition, depth and skeleton features are combined
    to improve recognition accuracy. However, this strategy
    results in high feature dimensions and low
    discrimination due to redundant feature vectors. To
    solve this drawback, a multi-feature selection approach
    for 3D human action recognition is proposed in this
    paper. First, three novel single-modal features are
    proposed to describe depth appearance, depth motion,
    and skeleton motion. Second, a classification entropy
    of random forest is used to evaluate the discrimination
    of the depth appearance based features. Finally, one of
    the three features is selected to recognize the sample
    according to the discrimination evaluation.
    Experimental results show that the proposed
    multi-feature selection approach significantly
    outperforms other approaches based on single-modal
    feature and feature fusion.},
}

@Article{Mazaheri:2018:LMC,
  author = {Amir Mazaheri and Boqing Gong and Mubarak Shah},
  title = {Learning a Multi-Concept Video Retrieval Model with Multiple Latent Variables},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {46:1--46:??},
  articleno = {46},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3176647},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Effective and efficient video retrieval has become a
    pressing need in the ``big video'' era. The objective
    of this work is to provide a principled model for
    computing the ranking scores of a video in response to
    one or more concepts, where the concepts could be
    directly supplied by users or inferred by the system
    from the user queries. Indeed, how to deal with
    multi-concept queries has become a central component in
    modern video retrieval systems that accept text
    queries. However, it has been long overlooked and
    simply implemented by weighted averaging of the
    corresponding concept detectors' scores. Our approach,
    which can be considered as a latent ranking SVM,
    integrates the advantages of various recent works in
    text and image retrieval, such as choosing ranking over
    structured prediction, modeling inter-dependencies
    between querying concepts, and so on. Videos consist of
    shots, and we use latent variables to account for the
    mutually complementary cues within and across shots.
    Concept labels of shots are scarce and noisy. We
    introduce a simple and effective technique to make our
    model robust to outliers. Our approach gives superior
    performance when it is tested on not only the queries
    seen at training but also novel queries, some of which
    consist of more concepts than the queries used for
    training.},
}

@Article{Tulilaulu:2018:DM,
  author = {Aurora Tulilaulu and Matti Nelimarkka and Joonas Paalasmaa
    and Daniel Johnson and Dan Ventura and Petri Myllys
    and Hannu Toivonen},
  title = {Data Musicalization},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {47:1--47:??},
  articleno = {47},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3184742},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Data musicalization is the process of automatically
    composing music based on given data as an approach to
    perceptualizing information artistically. The aim of
    data musicalization is to evoke subjective experiences
    in relation to the information rather than merely to
    convey unemotional information objectively. This
    article is written as a tutorial for readers interested
    in data musicalization. We start by providing a
    systematic characterization of musicalization
    approaches, based on their inputs, methods, and
    outputs. We then illustrate data musicalization
    techniques with examples from several applications: one
    that perceptualizes physical sleep data as music,
    several that artistically compose music inspired by the
    sleep data, one that musicalizes on-line chat
    conversations to provide a perceptualization of
    liveliness of a discussion, and one that uses
    musicalization in a gamelike mobile application that
    allows its users to produce music. We additionally
    provide a number of electronic samples of music
    produced by the different musicalization
    applications.},
}

@Article{Cornia:2018:PMA,
  author = {Marcella Cornia and Lorenzo Baraldi and Giuseppe Serra and Rita Cucchiara},
  title = {Paying More Attention to Saliency: Image Captioning with Saliency and Context Attention},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {48:1--48:??},
  articleno = {48},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3177745},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Image captioning has been recently gaining a lot of
    attention thanks to the impressive achievements shown
    by deep captioning architectures, which combine
    Convolutional Neural Networks to extract image
    representations and Recurrent Neural Networks to
    generate the corresponding captions. At the same time,
    a significant research effort has been dedicated to the
    development of saliency prediction models, which can
    predict human eye fixations. Even though saliency
    information could be useful to condition an image
    captioning architecture, by providing an indication of
    what is salient and what is not, research is still
    struggling to incorporate these two techniques. In this
    work, we propose an image captioning approach in which
    a generative recurrent neural network can focus on
    different parts of the input image during the
    generation of the caption, by exploiting the
    conditioning given by a saliency prediction model on
    which parts of the image are salient and which are
    contextual. We show, through extensive quantitative and
    qualitative experiments on large-scale datasets, that
    our model achieves superior performance with respect to
    captioning baselines with and without saliency and to
    different state-of-the-art approaches combining
    saliency and captioning.},
}

@Article{Wen:2018:CEE,
  author = {Longyin Wen and Honggang Qi and Siwei Lyu},
  title = {Contrast Enhancement Estimation for Digital Image Forensics},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {49:1--49:??},
  articleno = {49},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3183518},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Inconsistency in contrast enhancement can be used to
    expose image forgeries. In this work, we describe a new
    method to estimate contrast enhancement operations from
    a single image. Our method takes advantage of the
    nature of contrast enhancement as a mapping between
    pixel values and the distinct characteristics it
    introduces to the image pixel histogram. Our method
    recovers the original pixel histogram and the contrast
    enhancement simultaneously from a single image with an
    iterative algorithm. Unlike previous works, our method
    is robust in the presence of additive noise
    perturbations that are used to hide the traces of
    contrast enhancement. Furthermore, we also develop an
    effective method to detect image regions undergone
    contrast enhancement transformations that are different
    from the rest of the image, and we use this method to
    detect composite images. We perform extensive
    experimental evaluations to demonstrate the efficacy
    and efficiency of our method.},
}

@Article{Jiang:2018:DMP,
  author = {Yu-Gang Jiang and Minjun Li and Xi Wang and Wei Liu and Xian-Sheng Hua},
  title = {{DeepProduct}: Mobile Product Search With Portable Deep Features},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {50:1--50:??},
  articleno = {50},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3184745},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Features extracted by deep networks have been popular
    in many visual search tasks. This article studies deep
    network structures and training schemes for mobile
    visual search. The goal is to learn an effective yet
    portable feature representation that is suitable for
    bridging the domain gap between mobile user photos and
    (mostly) professionally taken product images while
    keeping the computational cost acceptable for
    mobile-based applications. The technical contributions
    are twofold. First, we propose an alternative of the
    contrastive loss popularly used for training deep
    Siamese networks, namely robust contrastive loss, where
    we relax the penalty on some positive and negative
    pairs to alleviate overfitting. Second, a simple
    multitask fine-tuning scheme is leveraged to train the
    network, which not only utilizes knowledge from the
    provided training photo pairs but also harnesses
    additional information from the large ImageNet dataset
    to regularize the fine-tuning process. Extensive
    experiments on challenging real-world datasets
    demonstrate that both the robust contrastive loss and
    the multitask fine-tuning scheme are effective, leading
    to very promising results with a time cost suitable for
    mobile product search scenarios.},
}

@Article{Ahmad:2018:EDM,
  author = {Kashif Ahmad and Mohamed Lamine Mekhalfi and Nicola Conci
    and Farid Melgani and Francesco {De Natale}},
  title = {Ensemble of Deep Models for Event Recognition},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {51:1--51:??},
  articleno = {51},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3199668},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {In this article, we address the problem of recognizing
    an event from a single related picture. Given the large
    number of event classes and the limited information
    contained in a single shot, the problem is known to be
    particularly hard. To achieve a reliable detection, we
    propose a combination of multiple classifiers, and we
    compare three alternative strategies to fuse the
    results of each classifier, namely: (i) induced order
    weighted averaging operators, (ii) genetic algorithms,
    and (iii) particle swarm optimization. Each method is
    aimed at determining the optimal weights to be assigned
    to the decision scores yielded by different deep
    models, according to the relevant optimization
    strategy. Experimental tests have been performed on
    three event recognition datasets, evaluating the
    performance of various deep models, both alone and
    selectively combined. Experimental results demonstrate
    that the proposed approach outperforms traditional
    multiple classifier solutions based on uniform
    weighting, and outperforms recent state-of-the-art
    approaches.},
}

@Article{Hu:2018:UER,
  author = {Wei Hu and Mozhdeh Seifi and Erik Reinhard},
  title = {Over- and Under-Exposure Reconstruction of a Single Plenoptic Capture},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2},
  pages = {52:1--52:??},
  articleno = {52},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3199514},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Light field images, for example, taken with plenoptic
    cameras, offer interesting post-processing
    opportunities, including depth-of-field management,
    depth estimation, viewpoint selection, and 3D image
    synthesis. Like most capture devices, however,
    plenoptic cameras have a limited dynamic range, so that
    over- and under-exposed areas in plenoptic images are
    commonplace. We therefore present a straightforward and
    robust plenoptic reconstruction technique based on the
    observation that vignetting causes peripheral views to
    receive less light than central views. Thus,
    corresponding pixels in different views can be used to
    reconstruct illumination, especially in areas where
    information missing in one view is present in another.
    Our algorithm accurately reconstructs under- and
    over-exposed regions (known as declipping),
    additionally affording an increase in peak luminance by
    up to two f-stops, and a comparable lowering of the
    noise floor. The key advantages of this approach are
    that no hardware modifications are necessary to improve
    the dynamic range, that no multiple exposure techniques
    are required, and therefore that no ghosting or other
    artifacts are introduced.},
}

@Article{Skorin-Kapov:2018:GES,
  author = {Lea Skorin-Kapov and Mart{\'\i}n Varela
    and Tobias Ho{\ss}feld and Kuan-Ta Chen},
  title = {Guest Editorial: Special Issue on {``QoE Management for Multimedia Services''}},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2s},
  pages = {28:1--28:??},
  articleno = {28},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3192332},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Skorin-Kapov:2018:SEC,
  author = {Lea Skorin-Kapov and Mart{\'\i}n Varela
    and Tobias Ho{\ss}feld and Kuan-Ta Chen},
  title = {A Survey of Emerging Concepts and Challenges for {QoE} Management of Multimedia Services},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2s},
  pages = {29:1--29:??},
  articleno = {29},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3176648},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Quality of Experience (QoE) has received much
    attention over the past years and has become a
    prominent issue for delivering services and
    applications. A significant amount of research has been
    devoted to understanding, measuring, and modelling QoE
    for a variety of media services. The next logical step
    is to actively exploit that accumulated knowledge to
    improve and manage the quality of multimedia services,
    while at the same time ensuring efficient and
    cost-effective network operations. Moreover, with many
    different players involved in the end-to-end service
    delivery chain, identifying the root causes of QoE
    impairments and finding effective solutions for meeting
    the end users' requirements and expectations in terms
    of service quality is a challenging and complex
    problem. In this article, we survey state-of-the-art
    findings and present emerging concepts and challenges
    related to managing QoE for networked multimedia
    services. Going beyond a number of previously published
    survey articles addressing the topic of QoE management,
    we address QoE management in the context of ongoing
    developments, such as the move to softwarized networks,
    the exploitation of big data analytics and machine
    learning, and the steady rise of new and immersive
    services (e.g., augmented and virtual reality). We
    address the implications of such paradigm shifts in
    terms of new approaches in QoE modeling and the need
    for novel QoE monitoring and management
    infrastructures.},
}

%%% NB: The abstract below originally read ``YouQ$^1$''; the superscript
%%% is apparently a footnote marker carried over from the publisher's
%%% abstract.  The footnote text is not part of this entry, so the
%%% dangling marker has been dropped.
@Article{Zhu:2018:MIV,
  author =       "Yi Zhu and Sharath Chandra Guntuku and Weisi Lin and
                 Gheorghita Ghinea and Judith A. Redi",
  title =        "Measuring Individual Video {QoE}: a Survey, and
                 Proposal for Future Directions Using Social Media",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "30:1--30:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3183512",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The next generation of multimedia services have to be
                 optimized in a personalized way, taking user factors
                 into account for the evaluation of individual
                 experience. Previous works have investigated the
                 influence of user factors mostly in a controlled
                 laboratory environment which often includes a limited
                 number of users and fails to reflect real-life
                 environment. Social media, especially Facebook, provide
                 an interesting alternative for Internet-based
                 subjective evaluation. In this article, we develop (and
                 open-source) a Facebook application, named YouQ, as
                 an experimental platform for studying individual
                 experience for videos. Our results show that subjective
                 experiments based on YouQ can produce reliable results
                 as compared to a controlled laboratory experiment.
                 Additionally, YouQ has the ability to collect user
                 information automatically from Facebook, which can be
                 used for modeling individual experience.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Petrangeli:2018:QEC,
  author = {Stefano Petrangeli and Jeroen {Van Der Hooft}
    and Tim Wauters and Filip {De Turck}},
  title = {Quality of Experience-Centric Management of Adaptive Video Streaming Services: Status and Challenges},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2s},
  pages = {31:1--31:??},
  articleno = {31},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3165266},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {Video streaming applications currently dominate
    Internet traffic. Particularly, HTTP Adaptive Streaming
    (HAS) has emerged as the dominant standard for
    streaming videos over the best-effort Internet, thanks
    to its capability of matching the video quality to the
    available network resources. In HAS, the video client
    is equipped with a heuristic that dynamically decides
    the most suitable quality to stream the content, based
    on information such as the perceived network bandwidth
    or the video player buffer status. The goal of this
    heuristic is to optimize the quality as perceived by
    the user, the so-called Quality of Experience (QoE).
    Despite the many advantages brought by the adaptive
    streaming principle, optimizing users' QoE is far from
    trivial. Current heuristics are still suboptimal when
    sudden bandwidth drops occur, especially in wireless
    environments, thus leading to freezes in the video
    playout, the main factor influencing users' QoE. This
    issue is aggravated in case of live events, where the
    player buffer has to be kept as small as possible in
    order to reduce the playout delay between the user and
    the live signal. In light of the above, in recent
    years, several works have been proposed with the aim of
    extending the classical purely client-based structure
    of adaptive video streaming, in order to fully optimize
    users' QoE. In this article, a survey is presented of
    research works on this topic together with a
    classification based on where the optimization takes
    place. This classification goes beyond client-based
    heuristics to investigate the usage of server- and
    network-assisted architectures and of new application
    and transport layer protocols. In addition, we outline
    the major challenges currently arising in the field of
    multimedia delivery, which are going to be of extreme
    relevance in future years.},
}

@Article{Bhat:2018:SNA,
  author = {Divyashri Bhat and Amr Rizk and Michael Zink and Ralf Steinmetz},
  title = {{SABR}: Network-Assisted Content Distribution for {QoE}-Driven {ABR} Video Streaming},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  volume = {14},
  number = {2s},
  pages = {32:1--32:??},
  articleno = {32},
  month = may,
  year = {2018},
  CODEN = {????},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  DOI = {https://doi.org/10.1145/3183516},
  bibdate = {Tue May 29 08:39:06 MDT 2018},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract = {State-of-the-art software-defined wide area networks
    (SD-WANs) provide the foundation for flexible and
    highly resilient networking. In this work, we design,
    implement, and evaluate a novel architecture (denoted
    as SABR) that leverages the benefits of
    software-defined networking (SDN) to provide
    network-assisted adaptive bitrate streaming. With
    clients retaining full control of their streaming
    algorithms, we clearly show that by this network
    assistance, both the clients and the content providers
    benefit significantly in terms of quality of experience
    (QoE) and content origin offloading. SABR utilizes
    information on available bandwidths per link and
    network cache contents to guide video streaming clients
    with the goal of improving the viewer's QoE. In
    addition, SABR uses SDN capabilities to dynamically
    program flows to optimize the utilization of content
    delivery network caches. Backed by our study of
    SDN-assisted streaming, we discuss the change in the
    requirements for network-to-player APIs that enables
    flexible video streaming. We illustrate the difficulty
    of the problem and the impact of SDN-assisted streaming
    on QoE metrics using various well-established player
    algorithms. We evaluate SABR together with
    state-of-the-art dynamic adaptive streaming over HTTP
    (DASH) quality adaptation algorithms through a series
    of experiments performed on a real-world, SDN-enabled
    testbed network with minimal modifications to an
    existing DASH client. In addition, we compare the
    performance of different caching strategies in
    combination with SABR. Our trace-based measurements
    show the substantial improvement in cache hit rates and
    QoE metrics in conjunction with SABR indicating a rich
    design space for jointly optimized SDN-assisted caching
    architectures for adaptive bitrate video streaming
    applications.},
}

@Article{Burger:2018:GAV,
  author =       "Valentin Burger and Thomas Zinner and Lam Dinh-Xuan
                 and Florian Wamser and Phuoc Tran-Gia",
  title =        "A Generic Approach to Video Buffer Modeling Using
                 Discrete-Time Analysis",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "33:1--33:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3183511",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The large share of traffic in the Internet generated
                 by video streaming services puts high loads on access
                 and aggregation networks, resulting in high costs for
                 the content delivery infrastructure. To reduce the
                 bandwidth consumed while maintaining a high playback
                 quality, video players use policies that control and
                 limit the buffer level by using thresholds for pausing
                 and continuing the video download. This allows shaping
                 the bandwidth consumed by video streams and limiting
                 the traffic wasted in case of playback abortion.
                 Especially in mobile scenarios, where the throughput
                 can be highly variant, the buffer policy can have a
                 high impact on the probability of interruptions during
                 video playback. To find the optimal setting for the
                 buffer policy in each network condition, the
                 relationship between the parameters of the buffer
                 policy, the network throughput dynamics, and the
                 corresponding video playback behavior needs to be
                 understood. To this end, we model the video buffer as
                 GI/GI/1 queue with pq-policy using discrete-time
                 analysis. By studying the stochastic properties of the
                 buffer-level distribution, we are able to accurately
                 evaluate the impact of network and video bitrate
                 dynamics on the video playback quality based on the
                 buffer policy. We find a fundamental relationship
                 between the bandwidth variation and the expected
                 interarrival time of segments, meaning that
                 overproportionately more bandwidth is necessary to
                 prevent stalling events for high bandwidth variation.
                 The proposed model further allows to optimize the
                 trade-off between the traffic wasted in case of video
                 abortion and video streaming quality experienced by the
                 user.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Siekkinen:2018:CYS,
  author =       "Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen and
                 Leonardo Favario and Enrico Masala",
  title =        "Can You See What {I} See? {Quality}-of-Experience
                 Measurements of Mobile Live Video Broadcasting",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "34:1--34:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3165279",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Broadcasting live video directly from mobile devices
                 is rapidly gaining popularity with applications like
                 Periscope and Facebook Live. The quality of experience
                 (QoE) provided by these services comprises many
                 factors, such as quality of transmitted video, video
                 playback stalling, end-to-end latency, and impact on
                 battery life, and they are not yet well understood. In
                 this article, we examine mainly the Periscope service
                 through a comprehensive measurement study and compare
                 it in some aspects to Facebook Live. We shed light on
                 the usage of Periscope through analysis of crawled data
                 and then investigate the aforementioned QoE factors
                 through statistical analyses as well as controlled
                 small-scale measurements using a couple of different
                 smartphones and both versions, Android and iOS, of the
                 two applications. We report a number of findings
                 including the discrepancy in latency between the two
                 most commonly used protocols, RTMP and HLS, surprising
                 surges in bandwidth demand caused by the Periscope
                 app's chat feature, substantial variations in video
                 quality, poor adaptation of video bitrate to available
                 upstream bandwidth at the video broadcaster side, and
                 significant power consumption caused by the
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bruneau-Queyreix:2018:PNS,
  author =       "Joachim Bruneau-Queyreix and Jordi Mongay Batalla and
                 Mathias Lacaud and Daniel Negru",
  title =        "{PMS}: a Novel Scale-Adaptive and Quality-Adaptive
                 Hybrid {P2P\slash}Multisource Solution for Live
                 Streaming",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "35:1--35:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3183515",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Single-source HTTP adaptive streaming solutions (HAS)
                 have become the de facto solutions to deliver live
                 video over the Internet. By avoiding video stalling
                 events that are mainly caused by the lack of throughput
                 at client or at server side, HAS solutions increase the
                 end users' quality of experience (QoE). We propose to
                 pragmatically extend HAS with our MS-Stream solution
                 that simultaneously utilizes several servers. MS-Stream
                 aims at offering high QoE for live content delivery by
                 exploiting expanded bandwidth and link diversity in
                 distributed heterogeneous infrastructures. By
                 leveraging end users' connectivity capacities, we
                 further extend the QoE and scalability capabilities of
                 our proposal by exposing a hybrid P2P/multisource
                 live-streaming solution (P2P/MS-Stream (PMS)),
                 achieving trade-offs between the system's scale and the
                 end users' QoE. We propose a distributed quality
                 adaptation algorithm run by every peer, along with a
                 local optimization method of the usage of the server
                 infrastructure made available. Large-scale evaluations
                 conducted with 300 peers located in France permits
                 validating our approach and algorithms over flash crowd
                 events and allow us to conclude that PMS can reach the
                 optimal trade-offs between QoE and system scale.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Floris:2018:QAO,
  author =       "Alessandro Floris and Arslan Ahmad and Luigi Atzori",
  title =        "{QoE}-Aware {OTT-ISP} Collaboration in Service
                 Management: Architecture and Approaches",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "36:1--36:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3183517",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "It is a matter of fact that quality of experience
                 (QoE) has become one of the key factors determining
                 whether a new multimedia service will be successfully
                 accepted by the final users. Accordingly, several QoE
                 models have been developed with the aim of capturing
                 the perception of the user by considering as many
                 influencing factors as possible. However, when it comes
                 to adopting these models in the management of the
                 services and networks, it frequently happens that no
                 single provider has access to all of the tools to
                 either measure all influencing factors parameters or
                 control over the delivered quality. In particular, it
                 often happens to the over-the-top (OTT) and Internet
                 service providers (ISPs), which act with complementary
                 roles in the service delivery over the Internet. On the
                 basis of this consideration, in this article we first
                 highlight the importance of a possible OTT-ISP
                 collaboration for a joint service management in terms
                 of technical and economic aspects. Then we propose a
                 general reference architecture for a possible
                 collaboration and information exchange among them.
                 Finally, we define three different approaches, namely
                 joint venture, customer lifetime value based, and QoE
                 fairness based. The first aims to maximize the revenue
                 by providing better QoE to customers paying more. The
                 second aims to maximize the profit by providing better
                 QoE to the most profitable customers (MPCs). The third
                 aims to maximize QoE fairness among all customers.
                 Finally, we conduct simulations to compare the three
                 approaches in terms of QoE provided to the users,
                 profit generated for the providers, and QoE fairness.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yan:2018:GES,
  author =       "Yan Yan and Liqiang Nie and Rita Cucchiara",
  title =        "Guest Editorial: Special Section on {``Multimedia
                 Understanding via Multimodal Analytics''}",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "37:1--37:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3192334",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Tiwari:2018:MMS,
  author =       "Akanksha Tiwari and Christian {Von Der Weth} and Mohan
                 S. Kankanhalli",
  title =        "Multimodal Multiplatform Social Media Event
                 Summarization",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "38:1--38:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3115433",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Social media platforms are turning into important news
                 sources since they provide real-time information from
                 different perspectives. However, high volume, dynamism,
                 noise, and redundancy exhibited by social media data
                 make it difficult to comprehend the entire content.
                 Recent works emphasize on summarizing the content of
                 either a single social media platform or of a single
                 modality (either textual or visual). However, each
                 platform has its own unique characteristics and user
                 base, which brings to light different aspects of
                 real-world events. This makes it critical as well as
                 challenging to combine textual and visual data from
                 different platforms. In this article, we propose
                 summarization of real-world events with data stemming
                 from different platforms and multiple modalities. We
                 present the use of a Markov Random Fields based
                 similarity measure to link content across multiple
                 platforms. This measure also enables the linking of
                 content across time, which is useful for tracking the
                 evolution of long-running events. For the final content
                 selection, summarization is modeled as a subset
                 selection problem. To handle the complexity of the
                 optimal subset selection, we propose the use of
                 submodular objectives. Facets such as coverage,
                 novelty, and significance are modeled as submodular
                 objectives in a multimodal social media setting. We
                 conduct a series of quantitative and qualitative
                 experiments to illustrate the effectiveness of our
                 approach compared to alternative methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2018:SAM,
  author =       "Anran Wang and Jianfei Cai and Jiwen Lu and Tat-Jen
                 Cham",
  title =        "Structure-Aware Multimodal Feature Fusion for {RGB-D}
                 Scene Classification and Beyond",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "39:1--39:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3115932",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "While convolutional neural networks (CNNs) have been
                 excellent for object recognition, the greater spatial
                 variability in scene images typically means that the
                 standard full-image CNN features are suboptimal for
                 scene classification. In this article, we investigate a
                 framework allowing greater spatial flexibility, in
                 which the Fisher vector (FV)-encoded distribution of
                 local CNN features, obtained from a multitude of region
                 proposals per image, is considered instead. The CNN
                 features are computed from an augmented pixel-wise
                 representation consisting of multiple modalities of
                 RGB, HHA, and surface normals, as extracted from RGB-D
                 data. More significantly, we make two postulates: (1)
                 component sparsity---that only a small variety of region
                 proposals and their corresponding FV GMM components
                 contribute to scene discriminability, and (2) modal
                 nonsparsity---that features from all modalities are
                 encouraged to coexist. In our proposed feature fusion
                 framework, these are implemented through regularization
                 terms that apply group lasso to GMM components and
                 exclusive group lasso across modalities. By learning
                 and combining regressors for both proposal-based FV
                 features and global CNN features, we are able to
                 achieve state-of-the-art scene classification
                 performance on the SUNRGBD Dataset and NYU Depth
                 Dataset V2. Moreover, we further apply our feature
                 fusion framework on an action recognition task to
                 demonstrate that our framework can be generalized for
                 other multimodal well-structured features. In
                 particular, for action recognition, we enforce
                 interpart sparsity to choose more discriminative body
                 parts, and intermodal nonsparsity to make informative
                 features from both appearance and motion modalities
                 coexist. Experimental results on the JHMDB and MPII
                 Cooking Datasets show that our feature fusion is also
                 very effective for action recognition, achieving very
                 competitive performance compared with the state of the
                 art.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2018:ICD,
  author =       "Cheng Wang and Haojin Yang and Christoph Meinel",
  title =        "Image Captioning with Deep Bidirectional {LSTMs} and
                 Multi-Task Learning",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "40:1--40:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3115432",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Generating a novel and descriptive caption of an image
                 is drawing increasing interests in computer vision,
                 natural language processing, and multimedia
                 communities. In this work, we propose an end-to-end
                 trainable deep bidirectional LSTM (Bi-LSTM (Long
                 Short-Term Memory)) model to address the problem. By
                 combining a deep convolutional neural network (CNN) and
                 two separate LSTM networks, our model is capable of
                 learning long-term visual-language interactions by
                 making use of history and future context information at
                 high-level semantic space. We also explore deep
                 multimodal bidirectional models, in which we increase
                 the depth of nonlinearity transition in different ways
                 to learn hierarchical visual-language embeddings. Data
                 augmentation techniques such as multi-crop,
                 multi-scale, and vertical mirror are proposed to
                 prevent overfitting in training deep models. To
                 understand how our models ``translate'' image to
                 sentence, we visualize and qualitatively analyze the
                 evolution of Bi-LSTM internal states over time. The
                 effectiveness and generality of proposed models are
                 evaluated on four benchmark datasets: Flickr8K,
                 Flickr30K, MSCOCO, and Pascal1K datasets. We
                 demonstrate that Bi-LSTM models achieve highly
                 competitive performance on both caption generation and
                 image-sentence retrieval even without integrating an
                 additional mechanism (e.g., object detection, attention
                 model). Our experiments also prove that multi-task
                 learning is beneficial to increase model generality and
                 gain performance. We also demonstrate the performance
                 of transfer learning of the Bi-LSTM model significantly
                 outperforms previous methods on the Pascal1K dataset.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Liu:2018:TPA,
  author =       "Zhenguang Liu and Yingjie Xia and Qi Liu and Qinming
                 He and Chao Zhang and Roger Zimmermann",
  title =        "Toward Personalized Activity Level Prediction in
                 Community Question Answering {Websites}",
  journal =      j-TOMM,
  volume =       "14",
  number =       "2s",
  pages =        "41:1--41:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3187011",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue May 29 08:39:06 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Community Question Answering (CQA) websites have
                 become valuable knowledge repositories. Millions of
                 internet users resort to CQA websites to seek answers
                 to their encountered questions. CQA websites provide
                 information far beyond a search on a site such as
                 Google due to (1) the plethora of high-quality answers,
                 and (2) the capabilities to post new questions toward
                 the communities of domain experts. While most research
                 efforts have been made to identify experts or to
                 preliminarily detect potential experts of CQA websites,
                 there has been a remarkable shift toward investigating
                 how to keep the engagement of experts. Experts are
                 usually the major contributors of high-quality answers
                 and questions of CQA websites. Consequently, keeping
                 the expert communities active is vital to improving the
                 lifespan of these websites. In this article, we present
                 an algorithm termed PALP to predict the activity level
                 of expert users of CQA websites. To the best of our
                 knowledge, PALP is the first approach to address a
                 personalized activity level prediction model for CQA
                 websites. Furthermore, it takes into consideration user
                 behavior change over time and focuses specifically on
                 expert users. Extensive experiments on the Stack
                 Overflow website demonstrate the competitiveness of
                 PALP over existing methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Abdallah:2018:AHD,
  author =       "Michal Muszynski and Theodoros Kostoulas and Patrizia
                 Lombardo and Thierry Pun and Guillaume Chanel",
  title =        "Aesthetic Highlight Detection in Movies Based on
                 Synchronization of Spectators' Reactions",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3",
  pages =        "68:1--68:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3175497",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Detection of aesthetic highlights is a challenge for
                 understanding the affective processes taking place
                 during movie watching. In this article, we study
                 spectators' responses to movie aesthetic stimuli in a
                 social context. Moreover, we look for uncovering the
                 emotional component of aesthetic highlights in movies.
                 Our assumption is that synchronized spectators'
                 physiological and behavioral reactions occur during
                 these highlights because: (i) aesthetic choices of
                 filmmakers are made to elicit specific emotional
                 reactions (e.g., special effects, empathy, and
                 compassion toward a character) and (ii) watching a
                 movie together causes spectators' affective reactions
                 to be synchronized through emotional contagion. We
                 compare different approaches to estimation of
                 synchronization among multiple spectators' signals,
                 such as pairwise, group, and overall synchronization
                 measures to detect aesthetic highlights in movies. The
                 results show that the unsupervised architecture relying
                 on synchronization measures is able to capture
                 different properties of spectators' synchronization and
                 detect aesthetic highlights based on both spectators'
                 electrodermal and acceleration signals. We discover
                 that pairwise synchronization measures perform the most
                 accurately independently of the category of the
                 highlights and movie genres. Moreover, we observe that
                 electrodermal signals have more discriminative power
                 than acceleration signals for highlight detection.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "68",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bai:2018:ADA,
  author =       "Yalong Bai and Kuiyuan Yang and Tao Mei and Wei-Ying
                 Ma and Tiejun Zhao",
  title =        "Automatic Data Augmentation from Massive {Web} Images
                 for Deep Visual Recognition",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3",
  pages =        "69:1--69:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3204941",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Large-scale image datasets and deep convolutional
                 neural networks (DCNNs) are the two primary driving
                 forces for the rapid progress in generic object
                 recognition tasks in recent years. While lots of
                 network architectures have been continuously designed
                 to pursue lower error rates, few efforts are devoted to
                 enlarging existing datasets due to high labeling costs
                 and unfair comparison issues. In this article, we aim
                 to achieve lower error rates by augmenting existing
                 datasets in an automatic manner. Our method leverages
                 both the web and DCNN, where the web provides massive
                 images with rich contextual information, and DCNN
                 replaces humans to automatically label images under the
                 guidance of web contextual information. Experiments
                 show that our method can automatically scale up
                 existing datasets significantly from billions of web
                 pages with high accuracy. The performance on object
                 recognition tasks and transfer learning tasks have been
                 significantly improved by using the automatically
                 augmented datasets, which demonstrates that more
                 supervisory information has been automatically gathered
                 from the web. Both the dataset and models trained on
                 the dataset have been made publicly available.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "69",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Tan:2018:UCD,
  author =       "Min Tan and Jun Yu and Zhou Yu and Fei Gao and Yong
                 Rui and Dacheng Tao",
  title =        "User-Click-Data-Based Fine-Grained Image Recognition
                 via Weakly Supervised Metric Learning",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3",
  pages =        "70:1--70:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209666",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "We present a novel fine-grained image recognition
                 framework using user click data, which can bridge the
                 semantic gap in distinguishing categories that are
                 similar in visual. As query set in click data is
                 usually large-scale and redundant, we first propose a
                 click-feature-based query-merging approach to merge
                 queries with similar semantics and construct a compact
                 click feature. Afterward, we utilize this compact click
                 feature and convolutional neural network (CNN)-based
                 deep visual feature to jointly represent an image.
                 Finally, with the combined feature, we employ the
                 metric-learning-based template-matching scheme for
                 efficient recognition. Considering the heavy noise in
                 the training data, we introduce a reliability variable
                 to characterize the image reliability, and propose a
                 weakly-supervised metric and template learning with
                 smooth assumption and click prior (WMTLSC) method to
                 jointly learn the distance metric, object templates,
                 and image reliability. Extensive experiments are
                 conducted on a public Clickture-Dog dataset and our
                 newly established Clickture-Bird dataset. It is shown
                 that the click-data-based query merging helps
                 generating a highly compact (the dimension is reduced
                 to 0.9\%) and dense click feature for images, which
                 greatly improves the computational efficiency. Also,
                 introducing this click feature into CNN feature further
                 boosts the recognition accuracy. The proposed framework
                 performs much better than previous state-of-the-arts in
                 fine-grained recognition tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "70",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bentaleb:2018:OSO,
  author =       "Abdelhak Bentaleb and Ali C. Begen and Roger
                 Zimmermann",
  title =        "{ORL-SDN}: Online Reinforcement Learning for
                 {SDN}-Enabled {HTTP} Adaptive Streaming",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3",
  pages =        "71:1--71:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3219752",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In designing an HTTP adaptive streaming (HAS) system,
                 the bitrate adaptation scheme in the player is a key
                 component to ensure a good quality of experience (QoE)
                 for viewers. We propose a new online reinforcement
                 learning optimization framework, called ORL-SDN,
                 targeting HAS players running in a software-defined
                 networking (SDN) environment. We leverage SDN to
                 facilitate the orchestration of the adaptation schemes
                 for a set of HAS players. To reach a good level of QoE
                 fairness in a large population of players, we cluster
                 them based on a perceptual quality index. We formulate
                 the adaptation process as a Partially Observable Markov
                 Decision Process and solve the per-cluster optimization
                 problem using an online Q-learning technique that
                 leverages model predictive control and parallelism via
                 aggregation to avoid a per-cluster suboptimal selection
                 and to accelerate the convergence to an optimum. This
                 framework achieves maximum long-term revenue by
                 selecting the optimal representation for each cluster
                 under time-varying network conditions. The results show
                 that ORL-SDN delivers substantial improvements in
                 viewer QoE, presentation quality stability, fairness,
                 and bandwidth utilization over well-known adaptation
                 schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "71",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Kong:2018:EVE,
    author = {Lingchao Kong and Rui Dai},
    title = {Efficient Video Encoding for Automatic Video Analysis
        in Distributed Wireless Surveillance Systems},
    journal = j-TOMM,
    volume = {14},
    number = {3},
    pages = {72:1--72:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3226036},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:44 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
        https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {In many distributed wireless surveillance
        applications, compressed videos are used for performing
        automatic video analysis tasks. The accuracy of object
        detection, which is essential for various video
        analysis tasks, can be reduced due to video quality
        degradation caused by lossy compression. This article
        introduces a video encoding framework with the
        objective of boosting the accuracy of object detection
        for wireless surveillance applications. The proposed
        video encoding framework is based on systematic
        investigation of the effects of lossy compression on
        object detection. It has been found that current
        standardized video encoding schemes cause temporal
        domain fluctuation for encoded blocks in stable
        background areas and spatial texture degradation for
        encoded blocks in dynamic foreground areas of a raw
        video, both of which degrade the accuracy of object
        detection. Two measures, the sum-of-absolute frame
        difference (SFD) and the degradation of texture in 2D
        transform domain (TXD), are introduced to depict the
        temporal domain fluctuation and the spatial texture
        degradation in an encoded video, respectively. The
        proposed encoding framework is designed to suppress
        unnecessary temporal fluctuation in stable background
        areas and preserve spatial texture in dynamic
        foreground areas based on the two measures, and it
        introduces new mode decision strategies for both intra-
        and interframes to improve the accuracy of object
        detection while maintaining an acceptable rate
        distortion performance. Experimental results show that,
        compared with traditional encoding schemes, the
        proposed scheme improves the performance of object
        detection and results in lower bit rates and
        significantly reduced complexity with comparable
        quality in terms of PSNR and SSIM.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {72},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Wang:2018:ICA,
    author = {Anqi Wang and Haifeng Hu and Liang Yang},
    title = {Image Captioning with Affective Guiding and Selective
        Attention},
    journal = j-TOMM,
    volume = {14},
    number = {3},
    pages = {73:1--73:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3226037},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:44 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {Image captioning is an increasingly important problem
        associated with artificial intelligence, computer
        vision, and natural language processing. Recent works
        revealed that it is possible for a machine to generate
        meaningful and accurate sentences for images. However,
        most existing methods ignore latent emotional
        information in an image. In this article, we propose a
        novel image captioning model with Affective Guiding and
        Selective Attention Mechanism named AG-SAM. In our
        method, we aim to bridge the affective gap between
        image captioning and the emotional response elicited by
        the image. First, we introduce affective components
        that capture higher-level concepts encoded in images
        into AG-SAM. Hence, our language model can be adapted
        to generate sentences that are more passionate and
        emotive. In addition, a selective gate acting on the
        attention mechanism controls the degree of how much
        visual information AG-SAM needs. Experimental results
        have shown that our model outperforms most existing
        methods, clearly reflecting an association between
        images and emotional components that is usually ignored
        in existing works.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {73},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Sikora:2018:SAS,
    author = {Marjan Sikora and Mladen Russo and Jurica Derek and
        Ante Jurcevi{\'c}},
    title = {Soundscape of an Archaeological Site Recreated with
        Audio Augmented Reality},
    journal = j-TOMM,
    volume = {14},
    number = {3},
    pages = {74:1--74:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3230652},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:44 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {This article investigates the use of an audio
        augmented reality (AAR) system to recreate the
        soundscape of a medieval archaeological site. The aim
        of our work was to explore whether it is possible to
        enhance a tourist's archaeological experience, which is
        often derived from only scarce remains. We developed a
        smartphone-based AAR system, which uses location and
        orientation sensors to synthesize the soundscape of a
        site and plays it to the user via headphones. We
        recreated the ancient soundscape of a medieval
        archaeological site in Croatia and tested it in situ on
        two groups of participants using the soundwalk method.
        One test group performed the soundwalk while listening
        to the recreated soundscape using the AAR system, while
        the second control group did not use the AAR equipment.
        We measured the experiences of the participants using
        two methods: the standard soundwalk questionnaire and
        affective computing equipment for detecting the
        emotional state of participants. The results of both
        test methods show that participants who were listening
        to the ancient soundscape using our AAR system
        experienced higher arousal than those visiting the site
        without AAR.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {74},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Kirchhoffer:2018:PDV,
  author =       "Heiner Kirchhoffer and Detlev Marpe and Heiko Schwarz
                 and Thomas Wiegand",
  title =        "Properties and Design of Variable-to-Variable Length
                 Codes",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3",
  pages =        "75:1--75:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3230653",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:44 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "For the entropy coding of independent and identically
                 distributed (i.i.d.) binary sources,
                 variable-to-variable length (V2V) codes are an
                 interesting alternative to arithmetic coding. Such a
                 V2V code translates variable length words of the source
                 into variable length code words by employing two
                 prefix-free codes. In this article, several properties
                 of V2V codes are studied, and new concepts are
                 developed. In particular, it is shown that the
                 redundancy of a V2V code cannot be zero for a binary
                 i.i.d. source $X$ with $ 0 < p_X (1) < 0.5 $.
                 Furthermore, the concept of prime and composite V2V
                 codes is proposed, and it is shown why composite V2V
                 codes can be disregarded in the search for particular
                 classes of minimum redundancy codes. Moreover, a
                 canonical representation for V2V codes is proposed,
                 which identifies V2V codes that have the same average
                 code length function. It is shown how these concepts
                 can be employed to greatly reduce the complexity of a
                 search for minimum redundancy (size-limited) V2V
                 codes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "75",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Kiess:2018:SCA,
    author = {Johannes Kiess and Stephan Kopf and Benjamin Guthier
        and Wolfgang Effelsberg},
    title = {A Survey on Content-Aware Image and Video
        Retargeting},
    journal = j-TOMM,
    volume = {14},
    number = {3},
    pages = {76:1--76:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3231598},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:44 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {This survey introduces the current state of the art in
        image and video retargeting and describes important
        ideas and technologies that have influenced the recent
        work. Retargeting is the process of adapting an image
        or video from one screen resolution to another to fit
        different displays, for example, when watching a wide
        screen movie on a normal television screen or a mobile
        device. As there has been considerable work done in
        this field already, this survey provides an overview of
        the techniques. It is meant to be a starting point for
        new research in the field. We include explanations of
        basic terms and operators, as well as the basic
        workflow of the different methods.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {76},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Cecil:2018:NBV,
    author = {J. Cecil and Avinash Gupta and M. Pirela-Cruz and
        Parmesh Ramanathan},
    title = {A Network-Based Virtual Reality Simulation Training
        Approach for Orthopedic Surgery},
    journal = j-TOMM,
    volume = {14},
    number = {3},
    pages = {77:1--77:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3232678},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:44 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {The focus of this article is on the adoption of
        immersive and haptic simulators for training of medical
        residents in a surgical process called Less Invasive
        Stabilization System (LISS) plating surgery. LISS
        surgery is an orthopedic surgical procedure to treat
        fractures of the femur bone. Development of such
        simulators is a complex task which involves multiple
        systems, technologies, and human experts. Emerging Next
        Generation Internet technologies were used to develop
        the standalone on-line haptic-based simulator
        accessible to the students 24/7. A standalone immersive
        surgical simulator was also developed using HTC Vive.
        Expert surgeons played an important role in developing
        the simulator system; use cases of the target surgical
        processes were built using a modeling language called
        the engineering Enterprise Modeling Language (eEML). A
        detailed study presenting the comparison between the
        haptic-based simulator and the immersive simulator has
        been also presented. The outcomes of this study
        underscore the potential of using such simulators in
        surgical training.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {77},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Dong:2018:LMK,
    author = {Husheng Dong and Ping Lu and Chunping Liu and Yi Ji
        and Shengrong Gong},
    title = {Learning Multiple Kernel Metrics for Iterative Person
        Re-Identification},
    journal = j-TOMM,
    volume = {14},
    number = {3},
    pages = {78:1--78:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3234929},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:44 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {In person re-identification most metric learning
        methods learn from training data only once, and then
        they are deployed for testing. Although impressive
        performance has been achieved, the discriminative
        information from successfully identified test samples
        are ignored. In this work, we present a novel
        re-identification framework termed Iterative Multiple
        Kernel Metric Learning (IMKML). Specifically, there are
        two main modules in IMKML. In the first module,
        multiple metrics are learned via a new derived Kernel
        Marginal Nullspace Learning (KMNL) algorithm. Taking
        advantage of learning a discriminative nullspace from
        neighborhood manifold, KMNL can well tackle the Small
        Sample Size (SSS) problem in re-identification distance
        metric learning. The second module is to construct a
        pseudo training set by performing re-identification on
        the testing set. The pseudo training set, which
        consists of the test image pairs that are highly
        probable correct matches, is then inserted into the
        labeled training set to retrain the metrics. By
        iteratively alternating between the two modules, many
        more samples will be involved for training and
        significant performance gains can be achieved.
        Experiments on four challenging datasets, including
        VIPeR, PRID450S, CUHK01, and Market-1501, show that the
        proposed method performs favorably against the
        state-of-the-art approaches, especially on the lower
        ranks.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {78},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Abdallah:2018:ISI,
    author = {Maha Abdallah and Kuan-Ta Chen and Carsten Griwodz and
        Cheng-Hsin Hsu},
    title = {Introduction to the Special Issue on Delay-Sensitive
        Video Computing in the Cloud},
    journal = j-TOMM,
    volume = {14},
    number = {3s},
    pages = {53:1--53:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3214698},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:45 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {53},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Abdallah:2018:DSV,
    author = {Maha Abdallah and Carsten Griwodz and Kuan-Ta Chen and
        Gwendal Simon and Pin-Chun Wang and Cheng-Hsin Hsu},
    title = {Delay-Sensitive Video Computing in the Cloud: a
        Survey},
    journal = j-TOMM,
    volume = {14},
    number = {3s},
    pages = {54:1--54:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3212804},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:45 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {While cloud servers provide a tremendous amount of
        resources for networked video applications, most
        successful stories of cloud-assisted video applications
        are presentational video services, such as YouTube and
        NetFlix. This article surveys the recent advances on
        delay-sensitive video computations in the cloud, which
        are crucial to cloud-assisted conversational video
        services, such as cloud gaming, Virtual Reality (VR),
        Augmented Reality (AR), and telepresence. Supporting
        conversational video services with cloud resources is
        challenging because most cloud servers are far away
        from the end users while these services incur the
        following stringent requirements: high bandwidth, short
        delay, and high heterogeneity. In this article, we
        cover the literature with a top-down approach: from
        applications and experience, to architecture and
        management, and to optimization in and outside of the
        cloud. We also point out major open challenges, hoping
        to stimulate more research activities in this emerging
        and exciting direction.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {54},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Li:2018:CES,
    author = {Yusen Li and Yunhua Deng and Xueyan Tang and Wentong
        Cai and Xiaoguang Liu and Gang Wang},
    title = {Cost-Efficient Server Provisioning for Cloud Gaming},
    journal = j-TOMM,
    volume = {14},
    number = {3s},
    pages = {55:1--55:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3190838},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:45 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {Cloud gaming has gained significant popularity
        recently due to many important benefits such as removal
        of device constraints, instant-on, and cross-platform.
        The properties of intensive resource demands and
        dynamic workloads make cloud gaming appropriate to be
        supported by an elastic cloud platform. Facing a large
        user population, a fundamental problem is how to
        provide satisfactory cloud gaming service at modest
        cost. We observe that the software storage cost could
        be substantial compared to the server running cost in
        cloud gaming using elastic cloud resources. Therefore,
        in this article, we address the server provisioning
        problem for cloud gaming to optimize both the server
        running cost and the software storage cost. We find
        that the distribution of game software among servers
        and the selection of server types both trigger
        tradeoffs between the software storage cost and the
        server running cost in cloud gaming. We formulate the
        problem with a stochastic model and employ queueing
        theory to conduct a solid theoretical analysis of the
        system behaviors under different request dispatching
        policies. We then propose several classes of algorithms
        to approximate the optimal solution. The proposed
        algorithms are evaluated by extensive experiments using
        real-world parameters. The results show that the
        proposed Ordered and Genetic algorithms are
        computationally efficient, nearly cost-optimal, and
        highly robust to dynamic changes.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {55},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Slivar:2018:GCD,
    author = {Ivan Slivar and Mirko Suznjevic and Lea Skorin-Kapov},
    title = {Game Categorization for Deriving {QoE}-Driven Video
        Encoding Configuration Strategies for Cloud Gaming},
    journal = j-TOMM,
    volume = {14},
    number = {3s},
    pages = {56:1--56:??},
    month = aug,
    year = {2018},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/3132041},
    ISSN = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L = {1551-6857},
    bibdate = {Wed Oct 2 10:12:45 MDT 2019},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract = {Cloud gaming has been recognized as a promising shift
        in the online game industry, with the aim of
        implementing the ``on demand'' service concept that has
        achieved market success in other areas of digital
        entertainment such as movies and TV shows. The concepts
        of cloud computing are leveraged to render the game
        scene as a video stream that is then delivered to
        players in real-time. The main advantage of this
        approach is the capability of delivering high-quality
        graphics games to any type of end user device; however,
        at the cost of high bandwidth consumption and strict
        latency requirements. A key challenge faced by cloud
        game providers lies in configuring the video encoding
        parameters so as to maximize player Quality of
        Experience (QoE) while meeting bandwidth availability
        constraints. In this article, we tackle one aspect of
        this problem by addressing the following research
        question: Is it possible to improve service adaptation
        based on information about the characteristics of the
        game being streamed? To answer this question, two main
        challenges need to be addressed: the need for different
        QoE-driven video encoding (re-)configuration strategies
        for different categories of games, and how to determine
        a relevant game categorization to be used for assigning
        appropriate configuration strategies. We investigate
        these problems by conducting two subjective laboratory
        studies with a total of 80 players and three different
        games. Results indicate that different strategies
        should likely be applied for different types of games,
        and show that existing game classifications are not
        necessarily suitable for differentiating game types in
        this context. We thus further analyze objective video
        metrics of collected game play video traces as well as
        player actions per minute and use this as input data
        for clustering of games into two clusters. Subjective
        results verify that different video encoding
        configuration strategies may be applied to games
        belonging to different clusters.},
    acknowledgement = ack-nhfb,
    ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
    articleno = {56},
    fjournal = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Claypool:2018:GID,
  author =       "Mark Claypool",
  title =        "Game Input with Delay---Moving Target Selection with a
                 Game Controller Thumbstick",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3s",
  pages =        "57:1--57:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3187288",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Hosting interactive video-based services, such as
                 computer games, in the Cloud poses particular
                 challenges given user sensitivity to delay. A better
                 understanding of the impact of delay on player-game
                 interactions can help design cloud systems and games
                 that accommodate delays inherent in cloud systems.
                 Previous top-down studies of delay using full-featured
                 games have helped understand the impact of delay, but
                 often do not generalize or lend themselves to analytic
                 modeling. Bottom-up studies isolating user input and
                 delay can better generalize and be used in models, but
                 have yet to be applied to cloud-hosted computer games.
                 In order to better understand delay impact in
                 cloud-hosted computer games, we conduct a large
                 bottom-up user study centered on a fundamental game
                 interaction---selecting a moving target with user input
                 impeded by delay. Our work builds a custom game that
                 controls both the target speed and input delay and has
                 players select the target using a game controller
                 analog thumbstick. Analysis of data from over 50 users
                 shows target selection time exponentially increases
                 with delay and target speed and is well-fit by an
                 exponential model that includes a delay and target
                 speed interaction term. A comparison with two previous
                 studies, both using a mouse instead of a thumbstick,
                 suggests the model's relationship between selection
                 time, delay, and target speed holds more broadly,
                 providing a foundation for a potential law explaining
                 moving target selection with delay encountered in
                 cloud-hosted games.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hou:2018:NHC,
  author          = {Hou, Xueshi and Lu, Yao and Dey, Sujit},
  title           = {Novel Hybrid-Cast Approach to Reduce Bandwidth and
                    Latency for Cloud-Based Virtual Space},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {58:1--58:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3205864},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {In this article, we explore the possibility of
                    enabling cloud-based virtual space applications for
                    better computational scalability and easy access from
                    any end device, including future lightweight wireless
                    head-mounted displays. In particular, we investigate
                    virtual space applications such as virtual classroom
                    and virtual gallery, in which the scenes and activities
                    are rendered in the cloud, with multiple views captured
                    and streamed to each end device. A key challenge is the
                    high bandwidth requirement to stream all the user
                    views, leading to high operational cost and potential
                    large delay in a bandwidth-restricted wireless network.
                    We propose a novel hybrid-cast approach to save
                    bandwidth in a multi-user streaming scenario. We
                    identify and broadcast the common pixels shared by
                    multiple users, while unicasting the residual pixels
                    for each user. We formulate the problem of minimizing
                    the total bitrate needed to transmit the user views
                    using hybrid-casting and describe our approach. A
                    common view extraction approach and a smart grouping
                    algorithm are proposed and developed to achieve our
                    hybrid-cast approach. Simulation results show that the
                    hybrid-cast approach can significantly reduce total
                    bitrate by up to 55\% and avoid congestion-related
                    latency, compared to traditional cloud-based approach
                    of transmitting all the views as individual unicast
                    streams, hence addressing the bandwidth challenges of
                    the cloud, with additional benefits in cost and
                    delay.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {58},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Liu:2018:CBC,
  author          = {Liu, Chang and Ooi, Wei Tsang and Jia, Jinyuan and
                    Zhao, Lei},
  title           = {{Cloud Baking}: Collaborative Scene Illumination for
                    Dynamic {Web$3$D} Scenes},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {59:1--59:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3206431},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {We propose Cloud Baking, a collaborative rendering
                    architecture for dynamic Web3D scenes. In our
                    architecture, the cloud renderer renders the scene with
                    the global illumination (GI) information in a GI map;
                    the web-based client renderer renders the scene with
                    ambient lighting only and blends it with the GI map
                    received from the cloud for the final scene. This
                    approach allows the users to interact with the web
                    scene and change the scene dynamically through the web
                    interface end, yet move the computationally heavy tasks
                    of global illumination computation to the cloud. A
                    challenge we face is the interaction delay that causes
                    the frames rendered on the cloud and the client to go
                    out of sync. We propose to use 3D warping and a
                    hole-filling algorithm designed for GI map to predict
                    the late GI map. We show both quantitatively and
                    visually the quality of the GI map produced using our
                    method. Our prediction algorithm allows us to further
                    reduce the frequency at which the GI map is computed
                    and sent from the server, reducing both computational
                    needs and bandwidth usage.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {59},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Cesar:2018:BPA,
  author          = {Cesar, Pablo and Hsu, Cheng-Hsin and Huang, Chun-Ying
                    and Hui, Pan},
  title           = {Best Papers of the {ACM Multimedia Systems (MMSys)
                    Conference 2017} and the {ACM Workshop on Network and
                    Operating System Support for Digital Audio and Video
                    (NOSSDAV) 2017}},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {60:1--60:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3214700},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {60},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Zahran:2018:AAS,
  author          = {Zahran, Ahmed H. and Quinlan, Jason J. and
                    Ramakrishnan, K. K. and Sreenan, Cormac J.},
  title           = {{ASAP}: Adaptive Stall-Aware Pacing for Improved
                    {DASH} Video Experience in Cellular Networks},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {61:1--61:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3219750},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The dramatic growth of video traffic represents a
                    practical challenge for cellular network operators in
                    providing a consistent streaming Quality of Experience
                    (QoE) to their users. Satisfying this objective has
                    so-far proved elusive, due to the inherent
                    characteristics of wireless networks and varying
                    channel conditions as well as variability in the video
                    bitrate that can degrade streaming performance. In this
                    article, we propose stall-aware pacing as a novel MPEG
                    DASH video traffic management solution that reduces
                    playback stalls and seeks to maintain a consistent QoE
                    for cellular users, even those with diverse channel
                    conditions. These goals are achieved by leveraging both
                    network and client state information to optimize the
                    pacing of individual video flows. We evaluate the
                    performance of two versions of stall-aware pacing
                    techniques extensively, including stall-aware pacing
                    (SAP) and adaptive stall-aware pacing (ASAP), using
                    real video content and clients, operating over a
                    simulated LTE network. We implement state-of-the-art
                    client adaptation and traffic management strategies for
                    direct comparisons with SAP and ASAP. Our results,
                    using a heavily loaded base station, show that SAP
                    reduces the number of stalls and the average stall
                    duration per session by up to 95\%. Additionally, SAP
                    ensures that clients with good channel conditions do
                    not dominate available wireless resources, evidenced by
                    a reduction of up to 40\% in the standard deviation of
                    the QoE metric across clients. We also show that ASAP
                    achieves additional performance gains by adaptively
                    pacing video streams based on the application buffer
                    state.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {61},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Zhou:2018:EOP,
  author =       "Chao Zhou and Zhenhua Li and Joe Osgood and Yao Liu",
  title =        "On the Effectiveness of Offset Projections for
                 $360$-Degree Video Streaming",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3s",
  pages =        "62:1--62:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209660",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "A new generation of video streaming technology,
                 360-degree video, promises greater immersiveness than
                 standard video streams. This level of immersiveness is
                 similar to that produced by virtual reality
                 devices---users can control the field of view using head
                 movements rather than needing to manipulate external
                 devices. Although 360-degree video could revolutionize
                 the streaming experience, its large-scale adoption is
                 hindered by a number of factors: 360-degree video
                 streams have larger bandwidth requirements and require
                 faster responsiveness to user inputs, and users may be
                 more sensitive to lower quality streams. In this
                 article, we review standard approaches toward
                 360-degree video encoding and compare these to families
                 of approaches that distort the spherical surface to
                 allow oriented concentrations of the 360-degree view.
                 We refer to these distorted projections as offset
                 projections. Our measurement studies show that most
                 types of offset projections produce rendered views with
                 better quality than their nonoffset equivalents when
                 view orientations are within 40 or 50 degrees of the
                 offset orientation. Offset projections complicate
                 adaptive 360-degree video streaming because they
                 require a combination of bitrate and view orientation
                 adaptations. We estimate that this combination of
                 streaming adaptation in two dimensions can cause over
                 57\% extra segments to be downloaded compared to an
                 ideal downloading strategy, wasting 20\% of the total
                 downloading bandwidth.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Bahirat:2018:DEM,
  author =       "Kanchan Bahirat and Chengyuan Lai and Ryan P. McMahan
                 and Balakrishnan Prabhakaran",
  title =        "Designing and Evaluating a Mesh Simplification
                 Algorithm for Virtual Reality",
  journal =      j-TOMM,
  volume =       "14",
  number =       "3s",
  pages =        "63:1--63:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209661",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "With the increasing accessibility of the mobile
                 head-mounted displays (HMDs), mobile virtual reality
                 (VR) systems are finding applications in various areas.
                 However, mobile HMDs are highly constrained with
                 limited graphics processing units (GPUs) and low
                 processing power and onboard memory. Hence, VR
                 developers must be cognizant of the number of polygons
                 contained within their virtual environments to avoid
                 rendering at low frame rates and inducing simulator
                 sickness. The most robust and rapid approach to keeping
                 the overall number of polygons low is to use mesh
                 simplification algorithms to create low-poly versions
                 of pre-existing, high-poly models. Unfortunately, most
                 existing mesh simplification algorithms cannot
                 adequately handle meshes with lots of boundaries or
                 nonmanifold meshes, which are common attributes of many
                 3D models. In this article, we present QEM$_{4VR}$, a
                 high-fidelity mesh simplification algorithm
                 specifically designed for VR. This algorithm addresses
                 the deficiencies of prior quadric error metric (QEM)
                 approaches by leveraging the insight that the most
                 relevant boundary edges lie along curvatures while
                 linear boundary edges can be collapsed. Additionally,
                 our algorithm preserves key surface properties, such as
                 normals, texture coordinates, colors, and materials, as
                 it preprocesses 3D models and generates their low-poly
                 approximations offline. We evaluated the effectiveness
                 of our QEM$_{4VR}$ algorithm by comparing its
                 simplified-mesh results to those of prior QEM
                 variations in terms of geometric approximation error,
                 texture error, progressive approximation errors, frame
                 rate impact, and perceptual quality measures. We found
                 that QEM$_{4VR}$ consistently yielded simplified meshes
                 with less geometric approximation error and texture
                 error than the prior QEM variations. It afforded better
                 frame rates than QEM variations with boundary
                 preservation constraints that create unnecessary lower
                 bounds on overall polygon count reduction. Our
                 evaluation revealed that QEM$_{4VR}$ did not fare well
                 in terms of existing perceptual distance measurements,
                 but human-based inspections demonstrate that these
                 algorithmic measurements are not suitable substitutes
                 for actual human perception. In turn, we present a
                 user-based methodology for evaluating the perceptual
                 qualities of mesh simplification algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wang:2018:ELV,
  author          = {Wang, Junjue and Amos, Brandon and Das, Anupam and
                    Pillai, Padmanabhan and Sadeh, Norman and
                    Satyanarayanan, Mahadev},
  title           = {Enabling Live Video Analytics with a Scalable and
                    Privacy-Aware Framework},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {64:1--64:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3209659},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {We show how to build the components of a
                    privacy-aware, live video analytics ecosystem from the
                    bottom up, starting with OpenFace, our new open-source
                    face recognition system that approaches
                    state-of-the-art accuracy. Integrating OpenFace with
                    interframe tracking, we build RTFace, a mechanism for
                    denaturing video streams that selectively blurs faces
                    according to specified policies at full frame rates.
                    This enables privacy management for live video
                    analytics while providing a secure approach for
                    handling retrospective policy exceptions. Finally, we
                    present a scalable, privacy-aware architecture for
                    large camera networks using RTFace and show how it can
                    be an enabler for a vibrant ecosystem and marketplace
                    of privacy-aware video streams and analytics
                    services.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {64},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Gudmundsson:2018:PWS,
  author          = {Gudmundsson, Gylfi {\Thorn}{\'o}r and J{\'o}nsson,
                    Bj{\"o}rn {\Thorn}{\'o}r and Amsaleg, Laurent and
                    Franklin, Michael J.},
  title           = {Prototyping a {Web}-Scale Multimedia Retrieval Service
                    Using {Spark}},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {65:1--65:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3209662},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The world has experienced phenomenal growth in data
                    production and storage in recent years, much of which
                    has taken the form of media files. At the same time,
                    computing power has become abundant with multi-core
                    machines, grids, and clouds. Yet it remains a challenge
                    to harness the available power and move toward
                    gracefully searching and retrieving from web-scale
                    media collections. Several researchers have
                    experimented with using automatically distributed
                    computing frameworks, notably Hadoop and Spark, for
                    processing multimedia material, but mostly using small
                    collections on small computing clusters. In this
                    article, we describe a prototype of a (near) web-scale
                    throughput-oriented MM retrieval service using the
                    Spark framework running on the AWS cloud service. We
                    present retrieval results using up to 43 billion SIFT
                    feature vectors from the public YFCC 100M collection,
                    making this the largest high-dimensional feature vector
                    collection reported in the literature. We also present
                    a publicly available demonstration retrieval system,
                    running on our own servers, where the implementation of
                    the Spark pipelines can be observed in practice using
                    standard image benchmarks, and downloaded for research
                    purposes. Finally, we describe a method to evaluate
                    retrieval quality of the ever-growing high-dimensional
                    index of the prototype, without actually indexing a
                    web-scale media collection.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {65},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Ma:2018:CUB,
  author          = {Ma, Ming and Zhang, Lei and Liu, Jiangchuan and Wang,
                    Zhi and Pang, Haitian and Sun, Lifeng and Li, Weihua
                    and Hou, Guangling and Chu, Kaiyan},
  title           = {Characterizing User Behaviors in Mobile Personal
                    Livecast: Towards an Edge Computing-assisted Paradigm},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {66:1--66:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3219751},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Mobile personal livecast (MPL) services are emerging
                    and have received great attention recently. In MPL,
                    numerous and geo-distributed ordinary people broadcast
                    their video contents to worldwide viewers. Different
                    from conventional social networking services like
                    Twitter and Facebook, which have a tolerance for
                    interaction delay, the interactions (e.g., chat
                    messages) in a personal livecast must be in real-time
                    with low feedback latency. These unique characteristics
                    inspire us to: (1) investigate how the relationships
                    (e.g., social links and geo-locations) between viewers
                    and broadcasters influence the user behaviors, which
                    has yet to be explored in depth; and (2) explore
                    insights to benefit the improvement of system
                    performance. In this article, we carry out extensive
                    measurements of a representative MPL system, with a
                    large-scale dataset containing 11M users. In the
                    current costly and limited cloud-based MPL system,
                    which is faced with scalability problem, we find: (1)
                    the long content uploading distances between
                    broadcasters and cloud ingesting servers result in an
                    impaired system QoS, including a high broadcast latency
                    and a frequently buffering events; and (2) most of the
                    broadcasters in MPL are geographically locally popular
                    (the majority of the views come from the same region of
                    the broadcaster), which consume vast computation and
                    bandwidth resources of the clouds and Content Delivery
                    Networks. Fortunately, the emergence of edge computing,
                    which provides cloud-computing capabilities at the edge
                    of the mobile network, naturally sheds new light on the
                    MPL system; i.e., localized ingesting, transcoding, and
                    delivering locally popular live content is possible.
                    Based on these critical observations, we propose an
                    edge-assisted MPL system that collaboratively utilizes
                    the core-cloud and abundant edge computing resources to
                    improve the system efficiency and scalability. In our
                    framework, we consider a dynamic broadcaster assignment
                    to minimize the broadcast latency while keeping the
                    resource lease cost low. We formulate the broadcaster
                    scheduling as a stable matching with migration problem
                    to solve it effectively. Compared with the current pure
                    cloud-based system, our edge-assisted delivery approach
                    reduces the broadcast latency by about 35\%.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {66},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Huang:2018:UBA,
  author          = {Huang, Lei and Ding, Bowen and Wang, Aining and Xu,
                    Yuedong and Zhou, Yipeng and Li, Xiang},
  title           = {User Behavior Analysis and Video Popularity Prediction
                    on a Large-Scale {VoD} System},
  journal         = j-TOMM,
  volume          = {14},
  number          = {3s},
  pages           = {67:1--67:??},
  month           = aug,
  year            = {2018},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3226035},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Understanding streaming user behavior is crucial to
                    the design of large-scale Video-on-Demand (VoD)
                    systems. In this article, we begin with the measurement
                    of individual viewing behavior from two aspects: the
                    temporal characteristics and user interest. We observe
                    that active users spend more hours on each active day,
                    and their daily request time distribution is more
                    scattered than that of the less active users, while the
                    inter-view time distribution differs negligibly between
                    two groups. The common interest in popular videos and
                    the latest uploaded videos is observed in both groups.
                    We then investigate the predictability of video
                    popularity as a collective user behavior through early
                    views. In the light of the limitations of classical
                    approaches, the Autoregressive-Moving-Average (ARMA)
                    model is employed to forecast the popularity dynamics
                    of individual videos at fine-grained time scales, thus
                    achieving much higher prediction accuracy. When applied
                    to video caching, the ARMA-assisted Least Frequently
                    Used (LFU) algorithm can outperform the Least Recently
                    Used (LRU) by 11--16\%, the well-tuned LFU by 6--13\%,
                    and the LFU is only 2--4\% inferior to the offline LFU
                    in terms of hit ratio.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {67},
  fjournal        = {ACM Transactions on Multimedia Computing,
                    Communications, and Applications},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Zhang:2018:JHA,
  author =       "Junfeng Zhang and Haifeng Hu",
  title =        "Joint Head Attribute Classifier and Domain-Specific
                 Refinement Networks for Face Alignment",
  journal =      j-TOMM,
  volume =       "14",
  number =       "4",
  pages =        "79:1--79:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3241059",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "In this article, a two-stage refinement network is
                 proposed for facial landmarks detection on
                 unconstrained conditions. Our model can be divided into
                 two modules, namely the Head Attribute Classifier (HAC)
                 module and the Domain-Specific Refinement (DSR) module.
                 Given an input facial image, HAC adopts multi-task
                 learning mechanism to detect the head pose and obtain
                 an initial shape. Based on the obtained head pose, DSR
                 designs three different CNN-based refinement networks
                 trained by specific domain, respectively, and
                 automatically selects the most approximate network for
                 the landmarks refinement. Different from existing
                 two-stage models, HAC combines head pose prediction
                 with facial landmarks estimation to improve the
                 accuracy of head pose prediction, as well as obtaining
                 a robust initial shape. Moreover, an adaptive
                 sub-network training strategy applied in the DSR module
                 can effectively solve the issue of traditional
                 multi-view methods that an improperly selected
                 sub-network may result in alignment failure. The
                 extensive experimental results on two public datasets,
                 AFLW and 300W, confirm the validity of our model.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "79",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{PascottiValem:2018:USL,
  author =       "Lucas {Pascotti Valem} and Carlos {Renan De Oliveira}
                 and Daniel Carlos {Guimar{\~a}es Pedronette} and
                 Jurandy Almeida",
  title =        "Unsupervised Similarity Learning through Rank
                 Correlation and {kNN} Sets",
  journal =      j-TOMM,
  volume =       "14",
  number =       "4",
  pages =        "80:1--80:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3241053",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "The increasing amount of multimedia data collections
                 available today evinces the pressing need for methods
                 capable of indexing and retrieving this content.
                 Despite the continuous advances in multimedia features
                 and representation models, to establish an effective
                 measure for comparing different multimedia objects
                 still remains a challenging task. While supervised and
                 semi-supervised techniques made relevant advances on
                 similarity learning tasks, scenarios where labeled data
                 are non-existent require different strategies. In such
                 situations, unsupervised learning has been established
                 as a promising solution, capable of considering the
                 contextual information and the dataset structure for
                 computing new similarity/dissimilarity measures. This
                 article extends a recent unsupervised learning
                 algorithm that uses an iterative re-ranking strategy to
                 take advantage of different $k$-Nearest Neighbors (kNN)
                 sets and rank correlation measures. Two novel
                 approaches are proposed for computing the kNN sets and
                 their corresponding top-$k$ lists. The proposed
                 approaches were validated in conjunction with various
                 rank correlation measures, yielding superior
                 effectiveness results in comparison with previous
                 works. In addition, we also evaluate the ability of the
                 method in considering different multimedia objects,
                 conducting an extensive experimental evaluation on
                 various image and video datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "80",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2018:TLD,
  author = {Hui-Yin Wu and Francesca Pal{\`u} and Roberto Ranon
    and Marc Christie},
  title = {Thinking Like a Director: Film Editing Patterns for
    Virtual Cinematographic Storytelling},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {81:1--81:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3241057},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {This article introduces Film Editing Patterns (FEP), a
    language to formalize film editing practices and
    stylistic choices found in movies. FEP constructs are
    constraints, expressed over one or more shots from a
    movie sequence, that characterize changes in
    cinematographic visual properties, such as shot sizes,
    camera angles, or layout of actors on the screen. We
    present the vocabulary of the FEP language, introduce
    its usage in analyzing styles from annotated film data,
    and describe how it can support users in the creative
    design of film sequences in 3D. More specifically, (i)
    we define the FEP language, (ii) we present an
    application to craft filmic sequences from 3D animated
    scenes that uses FEPs as a high level mean to select
    cameras and perform cuts between cameras that follow
    best practices in cinema, and (iii) we evaluate the
    benefits of FEPs by performing user experiments in
    which professional filmmakers and amateurs had to
    create cinematographic sequences. The evaluation
    suggests that users generally appreciate the idea of
    FEPs, and that it can effectively help novice and
    medium experienced users in crafting film sequences
    with little training.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {81},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Yu:2018:SPI,
  author =       "Tuo Yu and Haiming Jin and Wai-Tian Tan and Klara
                 Nahrstedt",
  title =        "{SKEPRID}: Pose and Illumination Change-Resistant
                 Skeleton-Based Person Re-Identification",
  journal =      j-TOMM,
  volume =       "14",
  number =       "4",
  pages =        "82:1--82:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3243217",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "Currently, the surveillance camera-based person
                 re-identification is still challenging because of
                 diverse factors such as people's changing poses and
                 various illumination. The various poses make it hard to
                 conduct feature matching across images, and the
                 illumination changes make color-based features
                 unreliable. In this article, we present SKEPRID, a
                 skeleton-based person re-identification method that
                 handles strong pose and illumination changes jointly.
                 To reduce the impacts of pose changes on
                 re-identification, we estimate the joints' positions of
                 a person based on the deep learning technique and thus
                 make it possible to extract features on specific body
                 parts with high accuracy. Based on the skeleton
                 information, we design a set of local color
                 comparison-based cloth-type features, which are
                 resistant to various lighting conditions. Moreover, to
                 better evaluate SKEPRID, we build the PO8LI
                 dataset, which has large pose and illumination
                 diversity. Our experimental results show that SKEPRID
                 outperforms state-of-the-art approaches in the case of
                 strong pose and illumination variation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "82",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Fan:2018:UPR,
  author = {Hehe Fan and Liang Zheng and Chenggang Yan and Yi
    Yang},
  title = {Unsupervised Person Re-identification: Clustering and
    Fine-tuning},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {83:1--83:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3243316},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {The superiority of deeply learned pedestrian
    representations has been reported in very recent
    literature of person re-identification (re-ID). In this
    article, we consider the more pragmatic issue of
    learning a deep feature with no or only a few labels.
    We propose a progressive unsupervised learning (PUL)
    method to transfer pretrained deep representations to
    unseen domains. Our method is easy to implement and can
    be viewed as an effective baseline for unsupervised
    re-ID feature learning. Specifically, PUL iterates
    between (1) pedestrian clustering and (2) fine-tuning
    of the convolutional neural network (CNN) to improve
    the initialization model trained on the irrelevant
    labeled dataset. Since the clustering results can be
    very noisy, we add a selection operation between the
    clustering and fine-tuning. At the beginning, when the
    model is weak, CNN is fine-tuned on a small amount of
    reliable examples that locate near to cluster centroids
    in the feature space. As the model becomes stronger, in
    subsequent iterations, more images are being adaptively
    selected as CNN training samples. Progressively,
    pedestrian clustering and the CNN model are improved
    simultaneously until algorithm convergence. This
    process is naturally formulated as self-paced learning.
    We then point out promising directions that may lead to
    further improvement. Extensive experiments on three
    large-scale re-ID datasets demonstrate that PUL outputs
    discriminative features that improve the re-ID
    accuracy. Our code has been released at
    https://github.com/hehefan/Unsupervised-Person-Re-identification-Clustering-and-Fine-tuning.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {83},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Lin:2018:REN,
  author =       "Xiaodan Lin and Xiangui Kang",
  title =        "Robust Electric Network Frequency Estimation with Rank
                 Reduction and Linear Prediction",
  journal =      j-TOMM,
  volume =       "14",
  number =       "4",
  pages =        "84:1--84:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3241058",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:45 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  abstract =     "This article deals with the problem of Electric
                 Network Frequency (ENF) estimation where Signal to
                 Noise Ratio (SNR) is an essential challenge. By
                 exploiting the low-rank structure of the ENF signal
                 from the audio spectrogram, we propose an approach
                 based on robust principal component analysis to get rid
                 of the interference from speech contents and some of
                 the background noise, which in our case can be regarded
                 as sparse in nature. Weighted linear prediction is
                 enforced on the low-rank signal subspace to gain
                 accurate ENF estimation. The performance of the
                 proposed scheme is analyzed and evaluated as a function
                 of SNR, and the Cram{\'e}r-Rao Lower Bound (CRLB) is
                 approached at an SNR level above $-10$ dB. Experiments on
                 real datasets have demonstrated the advantages of the
                 proposed method over state-of-the-art work in terms of
                 estimation accuracy. Specifically, the proposed scheme
                 can effectively capture the ENF fluctuations along the
                 time axis using small numbers of signal observations
                 while preserving sufficient frequency precision.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "84",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Li:2018:PMB,
  author = {Yue Li and Gaobo Yang and Yapei Zhu and Xiangling Ding
    and Rongrong Gong},
  title = {Probability Model-Based Early Merge Mode Decision for
    Dependent Views Coding in {$3$D-HEVC}},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {85:1--85:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3267128},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {As a 3D extension to the High Efficiency Video Coding
    (HEVC) standard, 3D-HEVC was developed to improve the
    coding efficiency of multiview videos. It inherits the
    prediction modes from HEVC, yet both Motion Estimation
    (ME) and Disparity Estimation (DE) are required for
    dependent views coding. This improves coding efficiency
    at the cost of huge computational costs. In this
    article, an early Merge mode decision approach is
    proposed for dependent texture views and dependent
    depth maps coding in 3D-HEVC based on priori and
    posterior probability models. First, the priori
    probability model is established by exploiting the
    hierarchical and interview correlations from those
    previously encoded blocks. Second, the posterior
    probability model is built by using the Coded Block
    Flag (CBF) of the current coding block. Finally, the
    joint priori and posterior probability model is adopted
    to early terminate the Merge mode decision for both
    dependent texture views and dependent depth maps
    coding. Experimental results show that the proposed
    approach saves 45.2\% and 30.6\% encoding time on
    average for dependent texture views and dependent depth
    maps coding while maintaining negligible loss of coding
    efficiency, respectively.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {85},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Santos:2018:HAS,
  author = {Joel A. F. {Dos Santos} and D{\'e}bora C.
    Muchaluat-Saade and C{\'e}cile Roisin and Nabil
    Laya{\"\i}da},
  title = {A Hybrid Approach for Spatio-Temporal Validation of
    Declarative Multimedia Documents},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {86:1--86:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3267127},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Declarative multimedia documents represent the
    description of multimedia applications in terms of
    media items and relationships among them. Relationships
    specify how media items are dynamically arranged in
    time and space during runtime. Although a declarative
    approach usually facilitates the authoring task,
    authors can still make mistakes due to incorrect use of
    language constructs or inconsistent or missing
    relationships in a document. In order to properly
    support multimedia application authoring, it is
    important to provide tools with validation
    capabilities. Document validation can indicate possible
    inconsistencies in a given document to an author so
    that it can be revised before deployment. Although very
    useful, multimedia validation tools are not often
    provided by authoring tools. This work proposes a
    multimedia validation approach that relies on a formal
    model called Simple Hypermedia Model (SHM). SHM is used
    for representing a document for the purpose of
    validation. An SHM document is validated using a hybrid
    approach based on two complementary techniques. The
    first one captures the document's spatio-temporal
    layout in terms of its state throughout its execution
    by means of a rewrite theory, and validation is
    performed through model-checking. The second one
    captures the document's layout in terms of intervals
    and event occurrences by means of Satisfiability Modulo
    Theories (SMT) formulas, and validation is performed
    through SMT solving. Due to different characteristics
    of both approaches, each validation technique
    complements the other in terms of expressiveness of SHM
    and tests to be checked. We briefly present validation
    tools that use our approach. They were evaluated with
    real NCL documents and by usability tests.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {86},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Wu:2018:ICS,
  author = {Jie Wu and Haifeng Hu and Yi Wu},
  title = {Image Captioning via Semantic Guidance Attention and
    Consensus Selection Strategy},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {87:1--87:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3271485},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Recently, a series of attempts have incorporated
    spatial attention mechanisms into the task of image
    captioning, which achieves a remarkable improvement in
    the quality of generative captions. However, the
    traditional spatial attention mechanism adopts latent
    and delayed semantic representations to decide which
    area should be paid more attention to, resulting in
    inaccurate semantic guidance and the introduction of
    redundant information. In order to optimize the spatial
    attention mechanism, we propose the Semantic Guidance
    Attention (SGA) mechanism in this article.
    Specifically, SGA utilizes semantic word
    representations to provide an intuitive semantic
    guidance that focuses accurately on semantic-related
    regions. Moreover, we reduce the difficulty of
    generating fluent sentences by updating the attention
    information in time. At the same time, the beam search
    algorithm is widely used to predict words during
    sequence generation. This algorithm generates a
    sentence according to the probabilities of words, so it
    is easy to push out a generic sentence and discard some
    distinctive captions. In order to overcome this
    limitation, we design the Consensus Selection (CS)
    strategy to choose the most descriptive and informative
    caption, which is selected by the semantic similarity
    of captions instead of the probabilities of words. The
    consensus caption is determined by selecting the one
    with the highest cumulative semantic similarity with
    respect to the reference captions. Our proposed model
    (SGA-CS) is validated on Flickr30k and MSCOCO, which
    shows that SGA-CS outperforms state-of-the-art
    approaches. To our best knowledge, SGA-CS is the first
    attempt to jointly produce semantic attention guidance
    and select descriptive captions for image captioning
    tasks, achieving one of the best performance ratings
    among any cross-entropy training methods.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {87},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Strezoski:2018:OLS,
  author = {Gjorgji Strezoski and Marcel Worring},
  title = {{OmniArt}: a Large-scale Artistic Benchmark},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {88:1--88:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3273022},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {Baselines are the starting point of any quantitative
    multimedia research, and benchmarks are essential for
    pushing those baselines further. In this article, we
    present baselines for the artistic domain with a new
    benchmark dataset featuring over 2 million images with
    rich structured metadata dubbed OmniArt. OmniArt
    contains annotations for dozens of attribute types and
    features semantic context information through concepts,
    IconClass labels, color information, and (limited)
    object-level bounding boxes. For our dataset we
    establish and present baseline scores on multiple tasks
    such as artist attribution, creation period estimation,
    type, style, and school prediction. In addition to our
    metadata related experiments, we explore the color
    spaces of art through different types and evaluate a
    transfer learning object recognition pipeline.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {88},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Koch:2018:CYU,
  author = {Christian Koch and Moritz Lode and Denny Stohr and Amr
    Rizk and Ralf Steinmetz},
  title = {Collaborations on {YouTube}: From Unsupervised
    Detection to the Impact on Video and Channel
    Popularity},
  journal = j-TOMM,
  volume = {14},
  number = {4},
  pages = {89:1--89:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3241054},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:45 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract = {YouTube is the most popular platform for streaming of
    user-generated videos. Nowadays, professional YouTubers
    are organized in so-called multichannel networks
    (MCNs). These networks offer services such as brand
    deals, equipment, and strategic advice in exchange for
    a share of the YouTubers' revenues. A dominant strategy
    to gain more subscribers and, hence, revenue is
    collaborating with other YouTubers. Yet, collaborations
    on YouTube have not been studied in a detailed
    quantitative manner. To close this gap, first, we
    collect a YouTube dataset covering video statistics
    over 3 months for 7,942 channels. Second, we design a
    framework for collaboration detection given a
    previously unknown number of persons featured in
    YouTube videos. We denote this framework, for the
    detection and analysis of collaborations in YouTube
    videos using a Deep Neural Network (DNN)-based
    approach, as CATANA. Third, we analyze about 2.4 years
    of video content and use CATANA to answer research
    questions guiding YouTubers and MCNs for efficient
    collaboration strategies. Thereby, we focus on (1)
    collaboration frequency and partner selectivity, (2)
    the influence of MCNs on channel collaborations, (3)
    collaborating channel types, and (4) the impact of
    collaborations on video and channel popularity. Our
    results show that collaborations are in many cases
    significantly beneficial regarding viewers and newly
    attracted subscribers for both collaborating channels,
    often showing more than 100\% popularity growth
    compared with noncollaboration videos.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {89},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Zhang:2019:EQA,
  author = {Wei Zhang},
  title = {Efficient {QoE}-Aware Scheme for Video Quality
    Switching Operations in Dynamic Adaptive Streaming},
  journal = j-TOMM,
  volume = {15},
  number = {1},
  pages = {17:1--17:??},
  month = feb,
  year = {2019},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3269494},
  ISSN = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L = {1551-6857},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL = {https://dl.acm.org/ft_gateway.cfm?id=3269494},
  abstract = {Dynamic Adaptive Streaming over HTTP (DASH) is a
    popular over-the-top video content distribution
    technique that adapts the streaming session according
    to the user's network condition typically in terms of
    downlink bandwidth. This video quality adaptation can
    be achieved by scaling the frame quality, spatial
    resolution or frame rate. Despite the flexibility on
    the video quality scaling methods, each of these
    quality scaling dimensions has varying effects on the
    Quality of Experience (QoE) for end users. Furthermore,
    in video streaming, the changes in motion over time
    along with the scaling method employed have an
    influence on QoE, hence the need to carefully tailor
    scaling methods to suit streaming applications and
    content type. In this work, we investigate an
    intelligent DASH approach for the latest video coding
    standard H.265 and propose a heuristic QoE-aware
    cost-efficient adaptation scheme that does not switch
    unnecessarily to the highest quality level but rather
    stays temporarily at an intermediate quality level in
    certain streaming scenarios. Such an approach achieves
    a comparable and consistent level of quality under
    impaired network conditions as commonly found in
    Internet and mobile networks while reducing bandwidth
    requirements and quality switching overhead. The
    rationale is based on our empirical experiments, which
    show that an increase in bitrate does not necessarily
    mean noticeable improvement in QoE. Furthermore, our
    work demonstrates that the Signal-to-Noise Ratio (SNR)
    and the spatial resolution scalability types are the
    best fit for our proposed algorithm. Finally, we
    demonstrate an innovative interaction between quality
    scaling methods and the polarity of switching
    operations. The proposed QoE-aware scheme is
    implemented and empirical results show that it is able
    to reduce bandwidth requirements by up to 41\% whilst
    achieving equivalent QoE compared with a representative
    DASH reference implementation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno = {17},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Yahia:2019:HBF,
    author          = {Mariem {Ben Yahia} and Yannick {Le Louedec} and
        Gwendal Simon and Loutfi Nuaymi and Xavier Corbillon},
    title           = {{HTTP/2}-based Frame Discarding for Low-Latency
        Adaptive Video Streaming},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {18},
    pages           = {18:1--18:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3280854},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3280854},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {In this article, we propose video delivery schemes
        insuring around 1s delivery latency with Dynamic
        Adaptive Streaming over HTTP (DASH), which is a
        standard version of HTTP Live Streaming (HLS), so as to
        benefit from the video representation switching between
        successive video segments. We also propose HTTP/2-based
        algorithms to apply video frame discarding policies
        inside a video segment when a selected DASH
        representation does not match with the available
        network resources. The current solutions with small
        buffer suffer from rebuffering events. Rebuffering not
        only impacts the Quality of Experience (QoE) but also
        increases the delivery delay between the displayed and
        the original video streams. In this work, we completely
        eliminate rebuffering events by developing optimal and
        practical video frame discarding algorithms to meet the
        1s latency constraint. In all our algorithms, we
        request the video frames individually through HTTP/2
        multiple streams, and we selectively drop the least
        meaningful video frames thanks to HTTP/2 stream
        resetting feature. Our simulations show that the
        proposed algorithms eliminate rebuffering while
        insuring an acceptable video quality with at least a
        Peak Signal to Noise Ratio (PSNR) of 35dB compared to
        25dB of the basic First In First Out (FIFO) algorithm.
        We also quantify and qualify the resulting temporal
        distortion of the video segments per algorithm. An
        important number of missing video frames results in a
        temporal fluidity break known as video jitter. The
        displayed video looks like a series of snapshots. We
        show that both the optimal Integer Linear Program (ILP)
        and practical algorithms decrease the frequency and
        duration of the jitters. For example, practical
        algorithms reduce the number of crashed displayed
        videos (presenting one jitter longer than 1,350ms) with
        22\% compared to the basic FIFO algorithm. We also show
        that requesting video frames separately with HTTP/2
        slightly increases the overhead from 4.34\% to
        5.76\%.},
}

@article{Li:2019:SRC,
    author          = {Xianguo Li and Yemei Sun and Yanli Yang and Changyun
        Miao},
    title           = {Symmetrical Residual Connections for Single Image
        Super-Resolution},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {19},
    pages           = {19:1--19:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3282445},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3282445},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {Single-image super-resolution (SISR) methods based on
        convolutional neural networks (CNN) have shown great
        potential in the literature. However, most deep CNN
        models don't have direct access to subsequent layers,
        seriously hindering the information flow. Furthermore,
        they fail to make full use of the hierarchical features
        from different low-level layers, thereby resulting in
        relatively low accuracy. In this article, we present a
        new SISR CNN, called SymSR, which incorporates
        symmetrical nested residual connections to improve both
        the accuracy and the execution speed. SymSR takes a
        larger image region for contextual spreading. It
        symmetrically combines multiple short paths for the
        forward propagation to improve the accuracy and for the
        backward propagation of gradient flow to accelerate the
        convergence speed. Extensive experiments based on open
        challenge datasets show the effectiveness of
        symmetrical residual connections. Compared with four
        other state-of-the-art super-resolution CNN methods,
        SymSR is superior in both accuracy and runtime.},
}

@article{Yu:2019:DCM,
    author          = {Yi Yu and Suhua Tang and Francisco Raposo and Lei
        Chen},
    title           = {Deep Cross-Modal Correlation Learning for Audio and
        Lyrics in Music Retrieval},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {20},
    pages           = {20:1--20:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3281746},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3281746},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {Deep cross-modal learning has successfully
        demonstrated excellent performance in cross-modal
        multimedia retrieval, with the aim of learning joint
        representations between different data modalities.
        Unfortunately, little research focuses on cross-modal
        correlation learning where temporal structures of
        different data modalities, such as audio and lyrics,
        should be taken into account. Stemming from the
        characteristic of temporal structures of music in
        nature, we are motivated to learn the deep sequential
        correlation between audio and lyrics. In this work, we
        propose a deep cross-modal correlation learning
        architecture involving two-branch deep neural networks
        for audio modality and text modality (lyrics). Data in
        different modalities are converted to the same
        canonical space where intermodal canonical correlation
        analysis is utilized as an objective function to
        calculate the similarity of temporal structures. This
        is the first study that uses deep architectures for
        learning the temporal correlation between audio and
        lyrics. A pretrained Doc2Vec model followed by fully
        connected layers is used to represent lyrics. Two
        significant contributions are made in the audio branch,
        as follows: (i) We propose an end-to-end network to
        learn cross-modal correlation between audio and lyrics,
        where feature extraction and correlation learning are
        simultaneously performed and joint representation is
        learned by considering temporal structures. (ii) And,
        as for feature extraction, we further represent an
        audio signal by a short sequence of local summaries
        (VGG16 features) and apply a recurrent neural network
        to compute a compact feature that better learns the
        temporal structures of music audio. Experimental
        results, using audio to retrieve lyrics or using lyrics
        to retrieve audio, verify the effectiveness of the
        proposed deep correlation learning architectures in
        cross-modal music retrieval.},
}

@article{Sun:2019:ERF,
    author          = {Jia Sun and Di Huang and Yunhong Wang and Liming
        Chen},
    title           = {Expression Robust {$3$D} Facial Landmarking via
        Progressive Coarse-to-Fine Tuning},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {21},
    pages           = {21:1--21:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3282833},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3282833},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {Facial landmarking is a fundamental task in automatic
        machine-based face analysis. The majority of existing
        techniques for such a problem are based on 2D images;
        however, they suffer from illumination and pose
        variations that may largely degrade landmarking
        performance. The emergence of 3D data theoretically
        provides an alternative to overcome these weaknesses in
        the 2D domain. This article proposes a novel approach
        to 3D facial landmarking, which combines both the
        advantages of feature-based methods as well as
        model-based ones in a progressive three-stage
        coarse-to-fine manner (initial, intermediate, and fine
        stages). For the initial stage, a few fiducial
        landmarks (i.e., the nose tip and two inner eye
        corners) are robustly detected through curvature
        analysis, and these points are further exploited to
        initialize the subsequent stage. For the intermediate
        stage, a statistical model is learned in the feature
        space of three normal components of the facial
        point-cloud rather than the smooth original
        coordinates, namely Active Normal Model (ANM). For the
        fine stage, cascaded regression is employed to locally
        refine the landmarks according to their geometry
        attributes. The proposed approach can accurately
        localize dozens of fiducial points on each 3D face
        scan, greatly surpassing the feature-based ones, and it
        also improves the state of the art of the model-based
        ones in two aspects: sensitivity to initialization and
        deficiency in discrimination. The proposed method is
        evaluated on the BU-3DFE, Bosphorus, and BU-4DFE
        databases, and competitive results are achieved in
        comparison with counterparts in the literature, clearly
        demonstrating its effectiveness.},
}

@article{Peng:2019:CGC,
    author          = {Yuxin Peng and Jinwei Qi},
    title           = {{CM-GANs}: Cross-modal Generative Adversarial Networks
        for Common Representation Learning},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {22},
    pages           = {22:1--22:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3284750},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3284750},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {It is known that the inconsistent distributions and
        representations of different modalities, such as image
        and text, cause the heterogeneity gap, which makes it
        very challenging to correlate heterogeneous data and
        measure their similarities. Recently, generative
        adversarial networks (GANs) have been proposed and have
        shown their strong ability to model data distribution
        and learn discriminative representation. It has also
        been shown that adversarial learning can be fully
        exploited to learn discriminative common
        representations for bridging the heterogeneity gap.
        Inspired by this, we aim to effectively correlate
        large-scale heterogeneous data of different modalities
        with the power of GANs to model cross-modal joint
        distribution. In this article, we propose Cross-modal
        Generative Adversarial Networks (CM-GANs) with the
        following contributions. First, a cross-modal GAN
        architecture is proposed to model joint distribution
        over the data of different modalities. The
        inter-modality and intra-modality correlation can be
        explored simultaneously in generative and
        discriminative models. Both compete with each other to
        promote cross-modal correlation learning. Second, the
        cross-modal convolutional autoencoders with
        weight-sharing constraint are proposed to form the
        generative model. They not only exploit the cross-modal
        correlation for learning the common representations but
        also preserve reconstruction information for capturing
        the semantic consistency within each modality. Third, a
        cross-modal adversarial training mechanism is proposed,
        which uses two kinds of discriminative models to
        simultaneously conduct intra-modality and
        inter-modality discrimination. They can mutually boost
        to make the generated common representations more
        discriminative by the adversarial training process. In
        summary, our proposed CM-GAN approach can use GANs to
        perform cross-modal common representation learning by
        which the heterogeneous data can be effectively
        correlated. Extensive experiments are conducted to
        verify the performance of CM-GANs on cross-modal
        retrieval compared with 13 state-of-the-art methods on
        4 cross-modal datasets.},
}

@article{Pala:2019:RFM,
    author          = {Pietro Pala and Stefano Berretti},
    title           = {Reconstructing {$3$D} Face Models by Incremental
        Aggregation and Refinement of Depth Frames},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {23},
    pages           = {23:1--23:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3287309},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3287309},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {Face recognition from two-dimensional (2D) still
        images and videos is quite successful even with ``in
        the wild'' conditions. Instead, less consolidated
        results are available for the cases in which face data
        come from non-conventional cameras, such as infrared or
        depth. In this article, we investigate this latter
        scenario assuming that a low-resolution depth camera is
        used to perform face recognition in an uncooperative
        context. To this end, we propose, first, to
        automatically select a set of frames from the depth
        sequence of the camera because they provide a good view
        of the face in terms of pose and distance. Then, we
        design a progressive refinement approach to reconstruct
        a higher-resolution model from the selected
        low-resolution frames. This process accounts for the
        anisotropic error of the existing points in the current
        3D model and the points in a newly acquired frame so
        that the refinement step can progressively adjust the
        point positions in the model using a Kalman-like
        estimation. The quality of the reconstructed model is
        evaluated by considering the error between the
        reconstructed models and their corresponding
        high-resolution scans used as ground truth. In
        addition, we performed face recognition using the
        reconstructed models as probes against a gallery of
        reconstructed models and a gallery with high-resolution
        scans. The obtained results confirm the possibility to
        effectively use the reconstructed models for the face
        recognition task.},
}

%%% Abstract spelling corrected: ``endusers'' -> ``end users''.
%%% NOTE(review): the publisher's text may read ``end-users''; confirm
%%% against the ACM Digital Library record if exact fidelity matters.
@Article{Hu:2019:OCT,
  author =       "Han Hu and Yichao Jin and Yonggang Wen and Cedric
                 Westphal",
  title =        "Orchestrating Caching, Transcoding and Request Routing
                 for Adaptive Video Streaming Over {ICN}",
  journal =      j-TOMM,
  volume =       "15",
  number =       "1",
  pages =        "24:1--24:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3289184",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3289184",
  abstract =     "Information-centric networking (ICN) has been touted
                 as a revolutionary solution for the future of the
                 Internet, which will be dominated by video traffic.
                 This work investigates the challenge of distributing
                 video content of adaptive bitrate (ABR) over ICN. In
                 particular, we use the in-network caching capability of
                 ICN routers to serve users; in addition, with the help
                 of named function, we enable ICN routers to transcode
                 videos to lower-bitrate versions to improve the cache
                 hit ratio. Mathematically, we formulate this design
                 challenge into a constrained optimization problem,
                 which aims to maximize the cache hit ratio for service
                 providers and minimize the service delay for end users.
                 We design a two-step iterative algorithm to find the
                 optimum. First, given a content management scheme, we
                 minimize the service delay via optimally configuring
                 the routing scheme. Second, we maximize the cache hits
                 for a given routing policy. Finally, we rigorously
                 prove its convergence. Through extensive simulations,
                 we verify the convergence and the performance gains
                 over other algorithms. We also find that more resources
                 should be allocated to ICN routers with a heavier
                 request rate, and the routing scheme favors the
                 shortest path to schedule more traffic.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Yuan:2019:DLT,
    author          = {Bo Yuan and Xinbo Gao and Zhenxing Niu and Qi Tian},
    title           = {Discovering Latent Topics by {Gaussian} Latent
        {Dirichlet} Allocation and Spectral Clustering},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {25},
    pages           = {25:1--25:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3290047},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3290047},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {Today, diversifying the retrieval results of a certain
        query will improve customers' search efficiency.
        Showing the multiple aspects of information provides
        users an overview of the object, which helps them fast
        target their demands. To discover aspects, research
        focuses on generating image clusters from initially
        retrieved results. As an effective approach, latent
        Dirichlet allocation (LDA) has been proved to have good
        performance on discovering high-level topics. However,
        traditional LDA is designed to process textual words,
        and it needs the input as discrete data. When we apply
        this algorithm to process continuous visual images, a
        common solution is to quantize the continuous features
        into discrete form by a bag-of-visual-words algorithm.
        During this process, quantization error will lead to
        information that inevitably is lost. To construct a
        topic model with complete visual information, this work
        applies Gaussian latent Dirichlet allocation (GLDA) on
        the diversity issue of image retrieval. In this model,
        traditional multinomial distribution is substituted
        with Gaussian distribution to model continuous visual
        features. In addition, we propose a two-phase spectral
        clustering strategy, called dual spectral clustering,
        to generate clusters from region level to image level.
        The experiments on the challenging landmarks of the
        DIV400 database show that our proposal improves
        relevance and diversity by about 10\% compared to
        traditional topic models.},
}

@article{He:2019:ICV,
    author          = {Chen He and Haifeng Hu},
    title           = {Image Captioning With Visual-Semantic Double
        Attention},
    journal         = j-TOMM,
    fjournal        = {ACM Transactions on Multimedia Computing,
        Communications, and Applications},
    ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
    volume          = {15},
    number          = {1},
    articleno       = {26},
    pages           = {26:1--26:??},
    month           = feb,
    year            = {2019},
    DOI             = {https://doi.org/10.1145/3292058},
    URL             = {https://dl.acm.org/ft_gateway.cfm?id=3292058},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
    ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
    ISSN-L          = {1551-6857},
    CODEN           = {????},
    acknowledgement = ack-nhfb,
    bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
    abstract        = {In this article, we propose a novel Visual-Semantic
        Double Attention (VSDA) model for image captioning. In
        our approach, VSDA consists of two parts: a modified
        visual attention model is used to extract sub-region
        image features, then a new SEmantic Attention (SEA)
        model is proposed to distill semantic features.
        Traditional attribute-based models always neglect the
        distinctive importance of each attribute word and fuse
        all of them into recurrent neural networks, resulting
        in abundant irrelevant semantic features. In contrast,
        at each timestep, our model selects the most relevant
        word that aligns with current context. In other words,
        the real power of VSDA lies in the ability of not only
        leveraging semantic features but also eliminating the
        influence of irrelevant attribute words to make the
        semantic guidance more precise. Furthermore, our
        approach solves the problem that visual attention
        models cannot boost generating non-visual words.
        Considering that visual and semantic features are
        complementary to each other, our model can leverage
        both of them to strengthen the generations of visual
        and non-visual words. Extensive experiments are
        conducted on famous datasets: MS COCO and Flickr30k.
        The results show that VSDA outperforms other methods
        and achieves promising performance.},
}

%%% Abstract spelling corrected: ``cross-model retrieval'' -> ``cross-modal
%%% retrieval'' (two occurrences), to agree with the term ``cross-modal
%%% retrieval'' introduced earlier in the same abstract.
@Article{Liu:2019:MII,
  author =       "Ruoyu Liu and Yao Zhao and Shikui Wei and Liang Zheng
                 and Yi Yang",
  title =        "Modality-Invariant Image-Text Embedding for
                 Image-Sentence Matching",
  journal =      j-TOMM,
  volume =       "15",
  number =       "1",
  pages =        "27:1--27:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3300939",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3300939",
  abstract =     "Performing direct matching among different modalities
                 (like image and text) can benefit many tasks in
                 computer vision, multimedia, information retrieval, and
                 information fusion. Most of existing works focus on
                 class-level image-text matching, called cross-modal
                 retrieval, which attempts to propose a uniform model
                 for matching images with all types of texts, for
                 example, tags, sentences, and articles (long texts).
                 Although cross-modal retrieval alleviates the
                 heterogeneous gap among visual and textual information,
                 it can provide only a rough correspondence between two
                 modalities. In this article, we propose a more precise
                 image-text embedding method, image-sentence matching,
                 which can provide heterogeneous matching in the
                 instance level. The key issue for image-text embedding
                 is how to make the distributions of the two modalities
                 consistent in the embedding space. To address this
                 problem, some previous works on the cross-modal
                 retrieval task have attempted to pull close their
                 distributions by employing adversarial learning.
                 However, the effectiveness of adversarial learning on
                 image-sentence matching has not been proved and there
                 is still not an effective method. Inspired by previous
                 works, we propose to learn a modality-invariant
                 image-text embedding for image-sentence matching by
                 involving adversarial learning. On top of the triplet
                 loss--based baseline, we design a modality
                 classification network with an adversarial loss, which
                 classifies an embedding into either the image or text
                 modality. In addition, the multi-stage training
                 procedure is carefully designed so that the proposed
                 network not only imposes the image-text similarity
                 constraints by ground-truth labels, but also enforces
                 the image and text embedding distributions to be
                 similar by adversarial learning. Experiments on two
                 public datasets (Flickr30k and MSCOCO) demonstrate that
                 our method yields stable accuracy improvement over the
                 baseline model and that our results compare favorably
                 to the state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Ma:2019:PFC,
  author          = {Ruijun Ma and Haifeng Hu and Weixuan Wang and Jia Xu
                     and Zhengming Li},
  title           = {Photorealistic Face Completion with Semantic Parsing
                     and Face Identity-Preserving Features},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1},
  pages           = {28:1--28:??},
  articleno       = {28},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3300940},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3300940},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Tremendous progress on deep learning has shown
                     exciting potential for a variety of face completion
                     tasks. However, most learning-based methods are limited
                     to handle general or structure specified face images
                     (e.g., well-aligned faces). In this article, we propose
                     a novel face completion algorithm, called Learning and
                     Preserving Face Completion Network (LP-FCN), which
                     simultaneously parses face images and extracts face
                     identity-preserving (FIP) features. By tackling these
                     two tasks in a mutually boosting way, the LP-FCN can
                     guide an identity preserving inference and ensure pixel
                     faithfulness of completed faces. In addition, we adopt
                     a global discriminator and a local discriminator to
                     distinguish real images from synthesized ones. By
                     training with a combined identity preserving, semantic
                     parsing and adversarial loss, the LP-FCN encourages the
                     completion results to be semantically valid and
                     visually consistent for more complicated image
                     completion tasks. Experiments show that our approach
                     obtains similar visual quality, but achieves better
                     performance on unaligned faces completion and fine
                     detailed synthesis against the state-of-the-art
                     methods.},
}

@Article{Lokoc:2019:ISS,
  author =       "Jakub Loko{\v{c}} and Gregor Koval{\v{c}}{\'\i}k and
                 Bernd M{\"u}nzer and Klaus Sch{\"o}ffmann and Werner
                 Bailer and Ralph Gasser and Stefanos Vrochidis and
                 Phuong Anh Nguyen and Sitapa Rujikietgumjorn and Kai
                 Uwe Barthel",
  title =        "Interactive Search or Sequential Browsing? {A}
                 Detailed Analysis of the {Video Browser Showdown
                 2018}",
  journal =      j-TOMM,
  volume =       "15",
  number =       "1",
  pages =        "29:1--29:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3295663",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3295663",
  abstract =     "This work summarizes the findings of the 7th iteration
                 of the Video Browser Showdown (VBS) competition
                 organized as a workshop at the 24th International
                 Conference on Multimedia Modeling in Bangkok. The
                 competition focuses on video retrieval scenarios in
                 which the searched scenes were either previously
                 observed or described by another person (i.e., an
                 example shot is not available). During the event, nine
                 teams competed with their video retrieval tools in
                 providing access to a shared video collection with 600
                 hours of video content. Evaluation objectives, rules,
                 scoring, tasks, and all participating tools are
                 described in the article. In addition, we provide some
                 insights into how the different teams interacted with
                 their video browsers, which was made possible by a
                 novel interaction logging mechanism introduced for this
                 iteration of the VBS. The results collected at the VBS
                 evaluation server confirm that searching for one
                 particular scene in the collection when given a limited
                 time is still a challenging task for many of the
                 approaches that were showcased during the event. Given
                 only a short textual description, finding the correct
                 scene is even harder. In ad hoc search with multiple
                 relevant scenes, the tools were mostly able to find at
                 least one scene, whereas recall was the issue for many
                 teams. The logs also reveal that even though recent
                 exciting advances in machine learning narrow the
                 classical semantic gap problem, user-centric interfaces
                 are still required to mediate access to specific
                 content. Finally, open challenges and lessons learned
                 are presented for future VBS events.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Zhang:2019:ESI,
  author          = {Wei Zhang and Ting Yao and Shiai Zhu and Abdulmotaleb
                     {El Saddik}},
  title           = {Editorial to Special Issue on Deep Learning for
                     Intelligent Multimedia Analytics},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {1:1--1:??},
  articleno       = {1},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3292059},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3292059},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Zhang:2019:DLB,
  author          = {Wei Zhang and Ting Yao and Shiai Zhu and Abdulmotaleb
                     {El Saddik}},
  title           = {Deep Learning-Based Multimedia Analytics: a Review},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {2:1--2:??},
  articleno       = {2},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3279952},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3279952},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {The multimedia community has witnessed the rise of
                     deep learning-based techniques in analyzing multimedia
                     content more effectively. In the past decade, the
                     convergence of deep-learning and multimedia analytics
                     has boosted the performance of several traditional
                     tasks, such as classification, detection, and
                     regression, and has also fundamentally changed the
                     landscape of several relatively new areas, such as
                     semantic segmentation, captioning, and content
                     generation. This article aims to review the development
                     path of major tasks in multimedia analytics and take a
                     look into future directions. We start by summarizing
                     the fundamental deep techniques related to multimedia
                     analytics, especially in the visual domain, and then
                     review representative high-level tasks powered by
                     recent advances. Moreover, the performance review of
                     popular benchmarks gives a pathway to technology
                     advancement and helps identify both milestone works and
                     future directions.},
}

@Article{Xie:2019:CAN,
  author =       "Hongtao Xie and Shancheng Fang and Zheng-Jun Zha and
                 Yating Yang and Yan Li and Yongdong Zhang",
  title =        "Convolutional Attention Networks for Scene Text
                 Recognition",
  journal =      j-TOMM,
  volume =       "15",
  number =       "1s",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3231737",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3231737",
  abstract =     "In this article, we present Convolutional Attention
                 Networks (CAN) for unconstrained scene text
                 recognition. Recent dominant approaches for scene text
                 recognition are mainly based on Convolutional Neural
                 Networks (CNN) and Recurrent Neural Networks (RNN),
                 where the CNN encodes images and the RNN generates
                 character sequences. Our CAN is different from these
                 methods; our CAN is completely built on CNN and
                 includes an attention mechanism. The distinctive
                 characteristics of our method include (i) CAN follows
                 encoder-decoder architecture, in which the encoder is a
                 deep two-dimensional CNN and the decoder is a
                 one-dimensional CNN; (ii) the attention mechanism is
                 applied in every convolutional layer of the decoder,
                 and we propose a novel spatial attention method using
                 average pooling; and (iii) position embeddings are
                 equipped in both a spatial encoder and a sequence
                 decoder to give our networks a sense of location. We
                 conduct experiments on standard datasets for scene text
                 recognition, including Street View Text, IIIT5K, and
                 ICDAR datasets. The experimental results validate the
                 effectiveness of different components and show that our
                 convolutional-based method achieves state-of-the-art or
                 competitive performance over prior works, even without
                 the use of RNN.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Chen:2019:SAD,
  author          = {Zhineng Chen and Shanshan Ai and Caiyan Jia},
  title           = {Structure-Aware Deep Learning for Product Image
                     Classification},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {4:1--4:??},
  articleno       = {4},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3231742},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3231742},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Automatic product image classification is a task of
                     crucial importance with respect to the management of
                     online retailers. Motivated by recent advancements of
                     deep Convolutional Neural Networks (CNN) on image
                     classification, in this work we revisit the problem in
                     the context of product images with the existence of a
                     predefined categorical hierarchy and attributes, aiming
                     to leverage the hierarchy and attributes to improve
                     classification accuracy. With these structure-aware
                     clues, we argue that more advanced deep models could be
                     developed beyond the flat one-versus-all classification
                     performed by conventional CNNs. To this end, novel
                     efforts of this work include a salient-sensitive CNN
                     that gazes into the product foreground by inserting a
                     dedicated spatial attention module; a multiclass
                     regression-based refinement that is expected to predict
                     more accurately by merging prediction scores from
                     multiple preceding CNNs, each corresponding to a
                     distinct classifier in the hierarchy; and a multitask
                     deep learning architecture that effectively explores
                     correlations among categories and attributes for
                     categorical label prediction. Experimental results on
                     nearly 1 million real-world product images basically
                     validate the effectiveness of the proposed efforts
                     individually and jointly, from which performance gains
                     are observed.},
}

@article{Jiang:2019:DPR,
  author          = {Shuqiang Jiang and Gongwei Chen and Xinhang Song and
                     Linhu Liu},
  title           = {Deep Patch Representations with Shared Codebook for
                     Scene Classification},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {5:1--5:??},
  articleno       = {5},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3231738},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3231738},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Scene classification is a challenging problem.
                     Compared with object images, scene images are more
                     abstract, as they are composed of objects. Object and
                     scene images have different characteristics with
                     different scales and composition structures. How to
                     effectively integrate the local mid-level semantic
                     representations including both object and scene
                     concepts needs to be investigated, which is an
                     important aspect for scene classification. In this
                     article, the idea of a sharing codebook is introduced
                     by organically integrating deep learning, concept
                     feature, and local feature encoding techniques. More
                     specifically, the shared local feature codebook is
                     generated from the combined ImageNet1K and Places365
                     concepts (Mixed1365) using convolutional neural
                     networks. As the Mixed1365 features cover all the
                     semantic information including both object and scene
                     concepts, we can extract a shared codebook from the
                     Mixed1365 features, which only contain a subset of the
                     whole 1,365 concepts with the same codebook size. The
                     shared codebook can not only provide complementary
                     representations without additional codebook training
                     but also be adaptively extracted toward different scene
                     classification tasks. A method of fusing the encoded
                     features with both the original codebook and the shared
                     codebook is proposed for scene classification. In this
                     way, more comprehensive and representative image
                     features can be generated for classification. Extensive
                     experimentations conducted on two public datasets
                     validate the effectiveness of the proposed method.
                     Besides, some useful observations are also revealed to
                     show the advantage of shared codebook.},
}

@article{Zhao:2019:VCR,
  author          = {Rui-Wei Zhao and Qi Zhang and Zuxuan Wu and Jianguo Li
                     and Yu-Gang Jiang},
  title           = {Visual Content Recognition by Exploiting Semantic
                     Feature Map with Attention and Multi-task Learning},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {6:1--6:??},
  articleno       = {6},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3231739},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3231739},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Recent studies have shown that spatial relationships
                     among objects are very important for visual
                     recognition, since they can provide rich clues on
                     object contexts within the images. In this article, we
                     introduce a novel method to learn the Semantic Feature
                     Map (SFM) with attention-based deep neural networks for
                     image and video classification in an end-to-end manner,
                     aiming to explicitly model the spatial object contexts
                     within the images. In particular, we explicitly apply
                     the designed gate units to the extracted object
                     features for important objects selection and noise
                     removal. These selected object features are then
                     organized into the proposed SFM, which is a compact and
                     discriminative representation with the spatial
                     information among objects preserved. Finally, we employ
                     either Fully Convolutional Networks (FCN) or Long-Short
                     Term Memory (LSTM) as the classifiers on top of the SFM
                     for content recognition. A novel multi-task learning
                     framework with image classification loss, object
                     localization loss, and grid labeling loss are also
                     introduced to help better learn the model parameters.
                     We conduct extensive evaluations and comparative
                     studies to verify the effectiveness of the proposed
                     approach on Pascal VOC 2007/2012 and MS-COCO benchmarks
                     for image classification. In addition, the experimental
                     results also show that the SFMs learned from the image
                     domain can be successfully transferred to CCV and FCVID
                     benchmarks for video classification.},
}

@article{Liu:2019:CMF,
  author          = {Xueliang Liu and Meng Wang and Zheng-Jun Zha and
                     Richang Hong},
  title           = {Cross-Modality Feature Learning via Convolutional
                     Autoencoder},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {7:1--7:??},
  articleno       = {7},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3231740},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3231740},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Learning robust and representative features across
                     multiple modalities has been a fundamental problem in
                     machine learning and multimedia fields. In this
                     article, we propose a novel MUltimodal Convolutional
                     AutoEncoder (MUCAE) approach to learn representative
                     features from visual and textual modalities. For each
                     modality, we integrate the convolutional operation into
                     an autoencoder framework to learn a joint
                     representation from the original image and text
                     content. We optimize the convolutional autoencoders of
                     different modalities jointly by exploiting the
                     correlation between the hidden representations from the
                     convolutional autoencoders, in particular by minimizing
                     both the reconstructing error of each modality and the
                     correlation divergence between the hidden feature of
                     different modalities. Compared to the conventional
                     solutions relying on hand-crafted features, the
                     proposed MUCAE approach encodes features from image
                     pixels and text characters directly and produces more
                     representative and robust features. We evaluate MUCAE
                     on cross-media retrieval as well as unimodal
                     classification tasks over real-world large-scale
                     multimedia databases. Experimental results have shown
                     that MUCAE performs better than the state-of-the-arts
                     methods.},
}

@article{Liu:2019:DCN,
  author          = {Jiawei Liu and Zheng-Jun Zha and Xuejin Chen and Zilei
                     Wang and Yongdong Zhang},
  title           = {Dense {$3$D}-Convolutional Neural Network for Person
                     Re-Identification in Videos},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {8:1--8:??},
  articleno       = {8},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3231741},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3231741},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Person re-identification aims at identifying a certain
                     pedestrian across non-overlapping multi-camera networks
                     in different time and places. Existing person
                     re-identification approaches mainly focus on matching
                     pedestrians on images; however, little attention has
                     been paid to re-identify pedestrians in videos.
                     Compared to images, video clips contain motion patterns
                     of pedestrians, which is crucial to person
                     re-identification. Moreover, consecutive video frames
                     present pedestrian appearance with different body poses
                     and from different viewpoints, providing valuable
                     information toward addressing the challenge of pose
                     variation, occlusion, and viewpoint change, and so on.
                     In this article, we propose a Dense 3D-Convolutional
                     Network (D3DNet) to jointly learn spatio-temporal and
                     appearance representation for person re-identification
                     in videos. The D3DNet consists of multiple
                     three-dimensional (3D) dense blocks and transition
                     layers. The 3D dense blocks enlarge the receptive
                     fields of visual neurons in both spatial and temporal
                     dimensions, leading to discriminative appearance
                     representation as well as short-term and long-term
                     motion patterns of pedestrians without the requirement
                     of an additional motion estimation module. Moreover, we
                     formulate a loss function consisting of an
                     identification loss and a center loss to minimize
                     intra-class variance and maximize inter-class variance
                     simultaneously, toward addressing the challenge of
                     large intra-class variance and small inter-class
                     variance. Extensive experiments on two real-world video
                     datasets of person identification, i.e., MARS and
                     iLIDS-VID, have shown the effectiveness of the proposed
                     approach.},
}

@article{Zhao:2019:DSM,
  author          = {Liang Zhao and Zhikui Chen and Laurence T. Yang and M.
                     Jamal Deen and Z. Jane Wang},
  title           = {Deep Semantic Mapping for Heterogeneous Multimedia
                     Transfer Learning Using Co-Occurrence Data},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {9:1--9:??},
  articleno       = {9},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3241055},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3241055},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Transfer learning, which focuses on finding a
                     favorable representation for instances of different
                     domains based on auxiliary data, can mitigate the
                     divergence between domains through knowledge transfer.
                     Recently, increasing efforts on transfer learning have
                     employed deep neural networks (DNN) to learn more
                     robust and higher level feature representations to
                     better tackle cross-media disparities. However, only a
                     few articles consider the correction and semantic
                     matching between multi-layer heterogeneous domain
                     networks. In this article, we propose a deep semantic
                     mapping model for heterogeneous multimedia transfer
                     learning (DHTL) using co-occurrence data. More
                     specifically, we integrate the DNN with canonical
                     correlation analysis (CCA) to derive a deep correlation
                     subspace as the joint semantic representation for
                     associating data across different domains. In the
                     proposed DHTL, a multi-layer correlation matching
                     network across domains is constructed, in which the CCA
                     is combined to bridge each pair of domain-specific
                     hidden layers. To train the network, a joint objective
                     function is defined and the optimization processes are
                     presented. When the deep semantic representation is
                     achieved, the shared features of the source domain are
                     transferred for task learning in the target domain.
                     Extensive experiments for three multimedia recognition
                     applications demonstrate that the proposed DHTL can
                     effectively find deep semantic representations for
                     heterogeneous domains, and it is superior to the
                     several existing state-of-the-art methods for deep
                     transfer learning.},
}

@article{Hossain:2019:ADL,
  author          = {M. Shamim Hossain and Syed Umar Amin and Mansour
                     Alsulaiman and Ghulam Muhammad},
  title           = {Applying Deep Learning for Epilepsy Seizure Detection
                     and Brain Mapping Visualization},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {1s},
  pages           = {10:1--10:??},
  articleno       = {10},
  month           = feb,
  year            = {2019},
  doi             = {https://doi.org/10.1145/3241056},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3241056},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  coden           = {????},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  abstract        = {Deep Convolutional Neural Network (CNN) has achieved
                     remarkable results in computer vision tasks for
                     end-to-end learning. We evaluate here the power of a
                     deep CNN to learn robust features from raw
                     Electroencephalogram (EEG) data to detect seizures.
                     Seizures are hard to detect, as they vary both inter-
                     and intra-patient. In this article, we use a deep CNN
                     model for seizure detection task on an open-access EEG
                     epilepsy dataset collected at the Boston Children's
                     Hospital. Our deep learning model is able to extract
                     spectral, temporal features from EEG epilepsy data and
                     use them to learn the general structure of a seizure
                     that is less sensitive to variations. For cross-patient
                     EEG data, our method produced an overall sensitivity of
                     90.00\%, specificity of 91.65\%, and overall accuracy
                     of 98.05\% for the whole dataset of 23 patients. The
                     system can detect seizures with an accuracy of 99.46\%.
                     Thus, it can be used as an excellent cross-patient
                     seizure classifier. The results show that our model
                     performs better than the previous state-of-the-art
                     models for patient-specific and cross-patient seizure
                     detection task. The method gave an overall accuracy of
                     99.65\% for patient-specific data. The system can also
                     visualize the special orientation of band power
                     features. We use correlation maps to relate spectral
                     amplitude features to the output in the form of images.
                     By using the results from our deep learning model, this
                     visualization method can be used as an effective
                     multimedia tool for producing quick and relevant brain
                     mapping images that can be used by medical experts for
                     further investigation.},
}

@article{Alameda-Pineda:2019:SSM,
  author = {Xavier Alameda-Pineda and Miriam Redi and Mohammad
    Soleymani and Nicu Sebe and Shih-Fu Chang and Samuel
    Gosling},
  title = {Special Section on Multimodal Understanding of Social,
    Affective, and Subjective Attributes},
  journal = j-TOMM,
  volume = {15},
  number = {1s},
  pages = {11:1--11:??},
  articleno = {11},
  month = feb,
  year = {2019},
  doi = {https://doi.org/10.1145/3292061},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3292061},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Multimedia scientists have largely focused their
    research on the recognition of tangible properties of
    data such as objects and scenes. Recently, the field
    has started evolving toward the modeling of more
    complex properties. For example, the understanding of
    social, affective, and subjective attributes of visual
    data has attracted the attention of many research teams
    at the crossroads of computer vision, multimedia, and
    social sciences. These intangible attributes include,
    for example, visual beauty, video popularity, or user
    behavior. Multiple, diverse challenges arise when
    modeling such properties from multimedia data. The
    sections concern technical aspects such as reliable
    groundtruth collection, the effective learning of
    subjective properties, or the impact of context in
    subjective perception; see Refs. [2] and [3].},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Hu:2019:VPI,
  author = {Chuan-Shen Hu and Yi-Tsung Hsieh and Hsiao-Wei Lin and
    Mei-Chen Yeh},
  title = {{Virtual Portraitist}: an Intelligent Tool for Taking
    Well-Posed Selfies},
  journal = j-TOMM,
  volume = {15},
  number = {1s},
  pages = {12:1--12:??},
  articleno = {12},
  month = feb,
  year = {2019},
  doi = {https://doi.org/10.1145/3288760},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3288760},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Smart photography carries the promise of quality
    improvement and functionality extension in making
    aesthetically appealing pictures. In this article, we
    focus on self-portrait photographs and introduce new
    methods that guide a user in how to best pose while
    taking a selfie. While most of the current solutions
    use a post processing procedure to beautify a picture,
    the developed tool enables a novel function of
    recommending a good look before the photo is captured.
    Given an input face image, the tool automatically
    estimates the pose-based aesthetic score, finds the
    most attractive angle of the face, and suggests how the
    pose should be adjusted. The recommendation results are
    determined adaptively to the appearance and initial
    pose of the input face. We apply a data mining approach
    to find distinctive, frequent itemsets and association
    rules from online profile pictures, upon which the
    aesthetic estimation and pose recommendation methods
    are developed. A simulated and a real image set are
    used for experimental evaluation. The results show the
    proposed aesthetic estimation method can effectively
    select user-favorable photos. Moreover, the
    recommendation performance for the vertical adjustment
    is moderately related to the degree of conformity among
    the professional photographers' recommendations. This
    study echoes the trend of instant photo sharing, in
    which a user takes a picture and then immediately
    shares it on a social network without engaging in
    tedious editing.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Okada:2019:MDG,
  author = {Shogo Okada and Laurent Son Nguyen and Oya Aran and
    Daniel Gatica-Perez},
  title = {Modeling Dyadic and Group Impressions with Intermodal
    and Interperson Features},
  journal = j-TOMM,
  volume = {15},
  number = {1s},
  pages = {13:1--13:??},
  articleno = {13},
  month = feb,
  year = {2019},
  doi = {https://doi.org/10.1145/3265754},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3265754},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {This article proposes a novel feature-extraction
    framework for inferring impression personality traits,
    emergent leadership skills, communicative competence,
    and hiring decisions. The proposed framework extracts
    multimodal features, describing each participant's
    nonverbal activities. It captures intermodal and
    interperson relationships in interactions and captures
    how the target interactor generates nonverbal behavior
    when other interactors also generate nonverbal
    behavior. The intermodal and interperson patterns are
    identified as frequent co-occurring events based on
    clustering from multimodal sequences. The proposed
    framework is applied to the SONVB corpus, which is an
    audiovisual dataset collected from dyadic job
    interviews, and the ELEA audiovisual data corpus, which
    is a dataset collected from group meetings. We evaluate
    the framework on a binary classification task involving
    15 impression variables from the two data corpora. The
    experimental results show that the model trained with
    co-occurrence features is more accurate than previous
    models for 14 out of 15 traits.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhao:2019:PER,
  author = {Sicheng Zhao and Amir Gholaminejad and Guiguang Ding
    and Yue Gao and Jungong Han and Kurt Keutzer},
  title = {Personalized Emotion Recognition by Personality-Aware
    High-Order Learning of Physiological Signals},
  journal = j-TOMM,
  volume = {15},
  number = {1s},
  pages = {14:1--14:??},
  articleno = {14},
  month = feb,
  year = {2019},
  doi = {https://doi.org/10.1145/3233184},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3233184},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Due to the subjective responses of different subjects
    to physical stimuli, emotion recognition methodologies
    from physiological signals are increasingly becoming
    personalized. Existing works mainly focused on modeling
    the involved physiological corpus of each subject,
    without considering the psychological factors, such as
    interest and personality. The latent correlation among
    different subjects has also been rarely examined. In
    this article, we propose to investigate the influence
    of personality on emotional behavior in a hypergraph
    learning framework. Assuming that each vertex is a
    compound tuple (subject, stimuli), multi-modal
    hypergraphs can be constructed based on the personality
    correlation among different subjects and on the
    physiological correlation among corresponding stimuli.
    To reveal the different importance of vertices,
    hyperedges, and modalities, we learn the weights for
    each of them. As the hypergraphs connect different
    subjects on the compound vertices, the emotions of
    multiple subjects can be simultaneously recognized. In
    this way, the constructed hypergraphs are
    vertex-weighted multi-modal multi-task ones. The
    estimated factors, referred to as emotion relevance,
    are employed for emotion recognition. We carry out
    extensive experiments on the ASCERTAIN dataset and the
    results demonstrate the superiority of the proposed
    method, as compared to the state-of-the-art emotion
    recognition approaches.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Trabelsi:2019:UDS,
  author = {Rim Trabelsi and Jagannadan Varadarajan and Le Zhang
    and Issam Jabri and Yong Pei and Fethi Smach and Ammar
    Bouallegue and Pierre Moulin},
  title = {Understanding the Dynamics of Social Interactions: a
    Multi-Modal Multi-View Approach},
  journal = j-TOMM,
  volume = {15},
  number = {1s},
  pages = {15:1--15:??},
  articleno = {15},
  month = feb,
  year = {2019},
  doi = {https://doi.org/10.1145/3300937},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3300937},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {In this article, we deal with the problem of
    understanding human-to-human interactions as a
    fundamental component of social events analysis.
    Inspired by the recent success of multi-modal visual
    data in many recognition tasks, we propose a novel
    approach to model dyadic interaction by means of
    features extracted from synchronized 3D skeleton
    coordinates, depth, and Red Green Blue (RGB) sequences.
    From skeleton data, we extract new view-invariant
    proxemic features, named Unified Proxemic Descriptor
    (UProD), which is able to incorporate intrinsic and
    extrinsic distances between two interacting subjects. A
    novel key frame selection method is introduced to
    identify salient instants of the interaction sequence
    based on the joints' energy. From Red Green Blue Depth
    (RGBD) videos, more holistic CNN features are extracted
    by applying an adaptive pre-trained Convolutional
    Neural Networks (CNNs) on optical flow frames. For
    better understanding the dynamics of interactions, we
    expand the boundaries of dyadic interactions analysis
    by proposing a fundamentally new modeling for
    non-treated problem aiming to discern the active from
    the passive interactor. Extensive experiments have been
    carried out on four multi-modal and multi-view
    interactions datasets. The experimental results
    demonstrate the superiority of our proposed techniques
    against the state-of-the-art approaches.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Gan:2019:MSF,
  author = {Tian Gan and Junnan Li and Yongkang Wong and Mohan S.
    Kankanhalli},
  title = {A Multi-sensor Framework for Personal Presentation
    Analytics},
  journal = j-TOMM,
  volume = {15},
  number = {2},
  pages = {30:1--30:??},
  articleno = {30},
  month = jun,
  year = {2019},
  doi = {https://doi.org/10.1145/3300941},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3300941},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Presentation has been an effective method for
    delivering information to an audience for many years.
    Over the past few decades, technological advancements
    have revolutionized the way humans deliver
    presentation. Conventionally, the quality of a
    presentation is usually evaluated through painstaking
    manual analysis with experts. Although the expert
    feedback is effective in assisting users to improve
    their presentation skills, manual evaluation suffers
    from high cost and is often not available to most
    individuals. In this work, we propose a novel
    multi-sensor self-quantification system for
    presentations, which is designed based on a new
    proposed assessment rubric. We present our analytics
    model with conventional ambient sensors (i.e., static
    cameras and Kinect sensor) and the emerging wearable
    egocentric sensors (i.e., Google Glass). In addition,
    we performed a cross-correlation analysis of speaker's
    vocal behavior and body language. The proposed
    framework is evaluated on a new presentation dataset,
    namely, NUS Multi-Sensor Presentation dataset, which
    consists of 51 presentations covering a diverse range
    of topics. To validate the efficacy of the proposed
    system, we have conducted a series of user studies with
    the speakers and an interview with an English
    communication expert, which reveals positive and
    promising feedback.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Tang:2019:RVL,
  author = {Pengjie Tang and Hanli Wang and Qinyu Li},
  title = {Rich Visual and Language Representation with
    Complementary Semantics for Video Captioning},
  journal = j-TOMM,
  volume = {15},
  number = {2},
  pages = {31:1--31:??},
  articleno = {31},
  month = jun,
  year = {2019},
  doi = {https://doi.org/10.1145/3303083},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3303083},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {It is interesting and challenging to translate a video
    to natural description sentences based on the video
    content. In this work, an advanced framework is built
    to generate sentences with coherence and rich semantic
    expressions for video captioning. A long short term
    memory (LSTM) network with an improved factored way is
    first developed, which takes the inspiration of LSTM
    with a conventional factored way and a common practice
    to feed multi-modal features into LSTM at the first
    time step for visual description. Then, the
    incorporation of the LSTM network with the proposed
    improved factored way and un-factored way is exploited,
    and a voting strategy is utilized to predict candidate
    words. In addition, for robust and abstract visual and
    language representation, residuals are employed to
    enhance the gradient signals that are learned from the
    residual network (ResNet), and a deeper LSTM network is
    constructed. Furthermore, three convolutional neural
    network based features extracted from GoogLeNet,
    ResNet101, and ResNet152, are fused to catch more
    comprehensive and complementary visual information.
    Experiments are conducted on two benchmark datasets,
    including MSVD and MSR-VTT2016, and competitive
    performances are obtained by the proposed techniques as
    compared to other state-of-the-art methods.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Shen:2019:MLS,
  author = {Chen Shen and Zhongming Jin and Wenqing Chu and
    Rongxin Jiang and Yaowu Chen and Guo-Jun Qi and
    Xian-Sheng Hua},
  title = {Multi-level Similarity Perception Network for Person
    Re-identification},
  journal = j-TOMM,
  volume = {15},
  number = {2},
  pages = {32:1--32:??},
  articleno = {32},
  month = jun,
  year = {2019},
  doi = {https://doi.org/10.1145/3309881},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3309881},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {In this article, we propose a novel deep Siamese
    architecture based on a convolutional neural network
    (CNN) and multi-level similarity perception for the
    person re-identification (re-ID) problem. According to
    the distinct characteristics of diverse feature maps,
    we effectively apply different similarity constraints
    to both low-level and high-level feature maps during
    training stage. Due to the introduction of appropriate
    similarity comparison mechanisms at different levels,
    the proposed approach can adaptively learn
    discriminative local and global feature
    representations, respectively, while the former is more
    sensitive in localizing part-level prominent patterns
    relevant to re-identifying people across cameras.
    Meanwhile, a novel strong activation pooling strategy
    is utilized on the last convolutional layer for
    abstract local-feature aggregation to pursue more
    representative feature representations. Based on this,
    we propose final feature embedding by simultaneously
    encoding original global features and discriminative
    local features. In addition, our framework has two
    other benefits: First, classification constraints can
    be easily incorporated into the framework, forming a
    unified multi-task network with similarity constraints.
    Second, as similarity-comparable information has been
    encoded in the network's learning parameters via
    back-propagation, pairwise input is not necessary at
    test time. That means we can extract features of each
    gallery image and build an index in an off-line manner,
    which is essential for large-scale real-world
    applications. Experimental results on multiple
    challenging benchmarks demonstrate that our method
    achieves splendid performance compared with the current
    state-of-the-art approaches.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Miao:2019:DLS,
  author = {Yu Miao and Haiwei Dong and Jihad Mohamad {Al Jaam}
    and Abdulmotaleb {El Saddik}},
  title = {A Deep Learning System for Recognizing Facial
    Expression in Real-Time},
  journal = j-TOMM,
  volume = {15},
  number = {2},
  pages = {33:1--33:??},
  articleno = {33},
  month = jun,
  year = {2019},
  doi = {https://doi.org/10.1145/3311747},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3311747},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {This article presents an image-based real-time facial
    expression recognition system that is able to recognize
    the facial expressions of several subjects on a webcam
    at the same time. Our proposed methodology combines a
    supervised transfer learning strategy and a joint
    supervision method with center loss, which is crucial
    for facial tasks. A newly proposed Convolutional Neural
    Network (CNN) model, MobileNet, which has both accuracy
    and speed, is deployed in both offline and in a
    real-time framework that enables fast and accurate
    real-time output. Evaluations towards two publicly
    available datasets, JAFFE and CK+, are carried out
    respectively. The JAFFE dataset reaches an accuracy of
    95.24\%, while an accuracy of 96.92\% is achieved on
    the 6-class CK+ dataset, which contains only the last
    frames of image sequences. At last, the average
    run-time cost for the recognition of the real-time
    implementation is around 3.57ms/frame on a NVIDIA
    Quadro K4200 GPU.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Mesfin:2019:UET,
  author = {Gebremariam Mesfin and Nadia Hussain and Alexandra
    Covaci and Gheorghita Ghinea},
  title = {Using Eye Tracking and Heart-Rate Activity to Examine
    Crossmodal Correspondences {QoE} in {Mulsemedia}},
  journal = j-TOMM,
  volume = {15},
  number = {2},
  pages = {34:1--34:??},
  articleno = {34},
  month = jun,
  year = {2019},
  doi = {https://doi.org/10.1145/3303080},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3303080},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Different senses provide us with information of
    various levels of precision and enable us to construct
    a more precise representation of the world. Rich
    multisensory simulations are thus beneficial for
    comprehension, memory reinforcement, or retention of
    information. Crossmodal mappings refer to the
    systematic associations often made between different
    sensory modalities (e.g., high pitch is matched with
    angular shapes) and govern multisensory processing. A
    great deal of research effort has been put into
    exploring cross-modal correspondences in the field of
    cognitive science. However, the possibilities they open
    in the digital world have been relatively unexplored.
    Multiple sensorial media (mulsemedia) provides a highly
    immersive experience to the users and enhances their
    Quality of Experience (QoE) in the digital world. Thus,
    we consider that studying the plasticity and the
    effects of cross-modal correspondences in a mulsemedia
    setup can bring interesting insights about improving
    the human computer dialogue and experience. In our
    experiments, we exposed users to videos with certain
    visual dimensions (brightness, color, and shape), and
    we investigated whether the pairing with a cross-modal
    matching sound (high and low pitch) and the
    corresponding auto-generated vibrotactile effects
    (produced by a haptic vest) lead to an enhanced QoE.
    For this, we captured the eye gaze and the heart rate
    of users while experiencing mulsemedia, and we asked
    them to fill in a set of questions targeting their
    enjoyment and perception at the end of the experiment.
    Results showed differences in eye-gaze patterns and
    heart rate between the experimental and the control
    group, indicating changes in participants' engagement
    when videos were accompanied by matching cross-modal
    sounds (this effect was the strongest for the video
    displaying angular shapes and high-pitch audio) and
    transitively generated cross-modal vibrotactile
    effects.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Cheung:2019:DOC,
  author = {Ming Cheung and James She and Weiwei Sun and Jiantao
    Zhou},
  title = {Detecting Online Counterfeit-goods Seller using
    Connection Discovery},
  journal = j-TOMM,
  volume = {15},
  number = {2},
  pages = {35:1--35:??},
  articleno = {35},
  month = jun,
  year = {2019},
  doi = {https://doi.org/10.1145/3311785},
  url = {https://dl.acm.org/ft_gateway.cfm?id=3311785},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {With the advancement of social media and mobile
    technology, any smartphone user can easily become a
    seller on social media and e-commerce platforms, such
    as Instagram and Carousell in Hong Kong or Taobao in
    China. A seller shows images of their products and
    annotates their images with suitable tags that can be
    searched easily by others. Those images could be taken
    by the seller, or the seller could use images shared by
    other sellers. Among sellers, some sell counterfeit
    goods, and these sellers may use disguising tags and
    language, which make detecting them a difficult task.
    This article proposes a framework to detect counterfeit
    sellers by using deep learning to discover connections
    among sellers from their shared images. Based on 473K
    shared images from Taobao, Instagram, and Carousell, it
    is proven that the proposed framework can detect
    counterfeit sellers. The framework is 30\% better than
    approaches using object recognition in detecting
    counterfeit sellers. To the best of our knowledge, this
    is the first work to detect online counterfeit sellers
    from their shared images.},
  fjournal = {ACM Transactions on Multimedia Computing,
    Communications, and Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  bibdate = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Yarnagula:2019:QMC,
  author          = {Hema Kumar Yarnagula and Parikshit Juluri and Sheyda
                     Kiani Mehr and Venkatesh Tamarapalli and Deep Medhi},
  title           = {{QoE} for Mobile Clients with {Segment-aware Rate
                     Adaptation Algorithm (SARA)} for {DASH} Video
                     Streaming},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {36:1--36:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3311749},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3311749},
  abstract        = {Dynamic adaptive streaming over HTTP (DASH) is widely
                     used for video streaming on mobile devices. Ensuring a
                     good quality of experience (QoE) for mobile video
                     streaming is essential, as it severely impacts both the
                     network and content providers' revenue. Thus, a good
                     rate adaptation algorithm at the client end that
                     provides high QoE is critically important. Recently, a
                     segment size-aware rate adaptation (SARA) algorithm was
                     proposed for DASH clients. However, its performance on
                     mobile clients has not been investigated so far. The
                     main contributions of this article are twofold: (1) We
                     discuss SARA's implementation for mobile clients to
                     improve the QoE in mobile video streaming, one that
                     accurately predicts the download time for the next
                     segment and makes an informed bitrate selection, and
                     (2) we developed a new parametric QoE model to compute
                     a cumulative score that helps in fair comparison of
                     different adaptation algorithms. Based on our
                     subjective and objective evaluation, we observed that
                     SARA for mobile clients outperforms others by 17\% on
                     average, in terms of the Mean Opinion Score, while
                     achieving, on average, a 76\% improvement in terms of
                     the interruption ratio. The score obtained from our new
                     parametric QoE model also demonstrates that the SARA
                     algorithm for mobile clients gives a better QoE among
                     all the algorithms.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Atrey:2019:WMD,
  author          = {Pradeep K. Atrey and Bakul Trehan and Mukesh K.
                     Saini},
  title           = {Watch Me from Distance {(WMD)}: a Privacy-Preserving
                     Long-Distance Video Surveillance System},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {37:1--37:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3312574},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3312574},
  abstract        = {Preserving the privacy of people in video surveillance
                     systems is quite challenging, and a significant amount
                     of research has been done to solve this problem in
                     recent times. Majority of existing techniques are based
                     on detecting bodily cues such as face and/or silhouette
                     and obscuring them so that people in the videos cannot
                     be identified. We observe that merely hiding bodily
                     cues is not enough for protecting identities of the
                     individuals in the videos. An adversary, who has prior
                     contextual knowledge about the surveilled area, can
                     identify people in the video by exploiting the implicit
                     inference channels such as behavior, place, and time.
                     This article presents an anonymous surveillance system,
                     called Watch Me from Distance (WMD), which advocates
                     for outsourcing of surveillance video monitoring
                     (similar to call centers) to the long-distance sites
                     where professional security operators watch the video
                     and alert the local site when any suspicious or
                     abnormal event takes place. We find that long-distance
                     monitoring helps in decoupling the contextual knowledge
                     of security operators. Since security operators at the
                     remote site could turn into adversaries, a trust
                     computation model to determine the credibility of the
                     operators is presented as an integral part of the
                     proposed system. The feasibility study and experiments
                     suggest that the proposed system provides more robust
                     measures of privacy yet maintains surveillance
                     effectiveness.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Hsu:2019:LMC,
  author          = {Chih-Fan Hsu and Yu-Shuen Wang and Chin-Laung Lei and
                     Kuan-Ta Chen},
  title           = {Look at Me! {Correcting} Eye Gaze in Live Video
                     Communication},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {38:1--38:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3311784},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3311784},
  abstract        = {Although live video communication is widely used, it
                     is generally less engaging than face-to-face
                     communication because of limitations on social,
                     emotional, and haptic feedback. Missing eye contact is
                     one such problem caused by the physical deviation
                     between the screen and camera on a device. Manipulating
                     video frames to correct eye gaze is a solution to this
                     problem. In this article, we introduce a system to
                     rotate the eyeball of a local participant before the
                     video frame is sent to the remote side. It adopts a
                     warping-based convolutional neural network to relocate
                     pixels in eye regions. To improve visual quality, we
                     minimize the L2 distance between the ground truths and
                     warped eyes. We also present several newly designed
                     loss functions to help network training. These new loss
                     functions are designed to preserve the shape of eye
                     structures and minimize color changes around the
                     periphery of eye regions. To evaluate the presented
                     network and loss functions, we objectively and
                     subjectively compared results generated by our system
                     and the state-of-the-art, DeepWarp, in relation to two
                     datasets. The experimental results demonstrated the
                     effectiveness of our system. In addition, we showed
                     that our system can perform eye-gaze correction in real
                     time on a consumer-level laptop. Because of the quality
                     and efficiency of the system, gaze correction by
                     postprocessing through this system is a feasible
                     solution to the problem of missing eye contact in video
                     communication.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Ahmad:2019:HDF,
  author          = {Kashif Ahmad and Nicola Conci},
  title           = {How Deep Features Have Improved Event Recognition in
                     Multimedia: a Survey},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {39:1--39:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3306240},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3306240},
  abstract        = {Event recognition is one of the areas in multimedia
                     that is attracting great attention of researchers.
                     Being applicable in a wide range of applications, from
                     personal to collective events, a number of interesting
                     solutions for event recognition using multimedia
                     information sources have been proposed. On the other
                     hand, following their immense success in
                     classification, object recognition, and detection, deep
                     learning has been shown to perform well in event
                     recognition tasks also. Thus, a large portion of the
                     literature on event analysis relies nowadays on deep
                     learning architectures. In this article, we provide an
                     extensive overview of the existing literature in this
                     field, analyzing how deep features and deep learning
                     architectures have changed the performance of event
                     recognition frameworks. The literature on event-based
                     analysis of multimedia contents can be categorized into
                     four groups, namely (i) event recognition in single
                     images; (ii) event recognition in personal photo
                     collections; (iii) event recognition in videos; and
                     (iv) event recognition in audio recordings. In this
                     article, we extensively review different
                     deep-learning-based frameworks for event recognition in
                     these four domains. Furthermore, we also review some
                     benchmark datasets made available to the scientific
                     community to validate novel event recognition
                     pipelines. In the final part of the manuscript, we also
                     provide a detailed discussion on basic insights
                     gathered from the literature review, and identify
                     future trends and challenges.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {39},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Chen:2019:ACV,
  author          = {Yadang Chen and Chuanyan Hao and Alex X. Liu and Enhua
                     Wu},
  title           = {Appearance-consistent Video Object Segmentation Based
                     on a Multinomial Event Model},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {40:1--40:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3321507},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3321507},
  abstract        = {In this study, we propose an effective and efficient
                     algorithm for unconstrained video object segmentation,
                     which is achieved in a Markov random field (MRF). In
                     the MRF graph, each node is modeled as a superpixel and
                     labeled as either foreground or background during the
                     segmentation process. The unary potential is computed
                     for each node by learning a transductive SVM classifier
                     under supervision by a few labeled frames. The pairwise
                     potential is used for the spatial-temporal smoothness.
                     In addition, a high-order potential based on the
                     multinomial event model is employed to enhance the
                     appearance consistency throughout the frames. To
                     minimize this intractable feature, we also introduce a
                     more efficient technique that simply extends the
                     original MRF structure. The proposed approach was
                     evaluated in experiments with different measures and
                     the results based on a benchmark demonstrated its
                     effectiveness compared with other state-of-the-art
                     algorithms.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

%%% NOTE: the citation label Roberto:2019:DLS is kept unchanged so that
%%% existing citations still resolve.  The author names were previously
%%% recorded family-name-first without commas, which made BibTeX treat
%%% the given names as surnames; they are now in given-name family-name
%%% order (first author: Roberto Pierdicca).
@Article{Roberto:2019:DLS,
  author =       "Roberto Pierdicca and Emanuele Frontoni and Primo
                 Zingaretti and Adriano Mancini and Jelena Loncarski and
                 Marina Paolanti",
  title =        "Design, Large-Scale Usage Testing, and Important
                 Metrics for Augmented Reality Gaming Applications",
  journal =      j-TOMM,
  volume =       "15",
  number =       "2",
  pages =        "41:1--41:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3311748",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3311748",
  abstract =     "Augmented Reality (AR) offers the possibility to
                 enrich the real world with digital mediated content,
                 increasing in this way the quality of many everyday
                 experiences. While in some research areas such as
                 cultural heritage, tourism, or medicine there is a
                 strong technological investment, AR for game purposes
                 struggles to become a widespread commercial
                 application. In this article, a novel framework for AR
                 kid games is proposed, already developed by the authors
                 for other AR applications such as Cultural Heritage and
                 Arts. In particular, the framework includes different
                 layers such as the development of a series of AR kid
                 puzzle games in an intermediate structure which can be
                 used as a standard for different applications
                 development, the development of a smart configuration
                 tool, together with general guidelines and long-life
                 usage tests and metrics. The proposed application is
                 designed for augmenting the puzzle experience, but can
                 be easily extended to other AR gaming applications.
                 Once the user has assembled the real puzzle, AR
                 functionality within the mobile application can be
                 unlocked, bringing to life puzzle characters, creating
                 a seamless game that merges AR interactions with the
                 puzzle reality. The main goals and benefits of this
                 framework can be seen in the development of a novel set
                 of AR tests and metrics in the pre-release phase (in
                 order to help the commercial launch and developers),
                 and in the release phase by introducing the measures
                 for long-life app optimization, usage tests and hint on
                 final users together with a measure to design policy,
                 providing a method for automatic testing of quality and
                 popularity improvements. Moreover, smart configuration
                 tools, as part of the general framework, enabling
                 multi-app and eventually also multi-user development,
                 have been proposed, facilitating the serialization of
                 the applications. Results were obtained from a
                 large-scale user test with about 4 million users on a
                 set of eight gaming applications, providing the
                 scientific community a workflow for implicit
                 quantitative analysis in AR gaming. Different data
                 analytics developed on the data collected by the
                 framework prove that the proposed approach is
                 affordable and reliable for long-life testing and
                 optimization.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Siarohin:2019:IIM,
  author =       "Aliaksandr Siarohin and Gloria Zen and Cveta
                 Majtanovic and Xavier Alameda-Pineda and Elisa Ricci
                 and Nicu Sebe",
  title =        "Increasing Image Memorability with Neural Style
                 Transfer",
  journal =      j-TOMM,
  volume =       "15",
  number =       "2",
  pages =        "42:1--42:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3311781",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3311781",
  abstract =     "Recent works in computer vision and multimedia have
                 shown that image memorability can be automatically
                 inferred exploiting powerful deep-learning models. This
                 article advances the state of the art in this area by
                 addressing a novel and more challenging issue: ``Given
                 an arbitrary input image, can we make it more
                 memorable?'' To tackle this problem, we introduce an
                 approach based on an editing-by-applying-filters
                 paradigm: given an input image, we propose to
                 automatically retrieve a set of ``style seeds,'' i.e.,
                 a set of style images that, applied to the input image
                 through a neural style transfer algorithm, provide the
                 highest increase in memorability. We show the
                 effectiveness of the proposed approach with experiments
                 on the publicly available LaMem dataset, performing
                 both a quantitative evaluation and a user study. To
                 demonstrate the flexibility of the proposed framework,
                 we also analyze the impact of different implementation
                 choices, such as using different state-of-the-art
                 neural style transfer methods. Finally, we show several
                 qualitative results to provide additional insights on
                 the link between image style and memorability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Do:2019:SDC,
  author          = {Thanh-Toan Do and Tuan Hoang and Dang-Khoa Le Tan and
                     Huu Le and Tam V. Nguyen and Ngai-Man Cheung},
  title           = {From Selective Deep Convolutional Features to Compact
                     Binary Representations for Image Retrieval},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {43:1--43:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3314051},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/hash.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3314051},
  abstract        = {In the large-scale image retrieval task, the two most
                     important requirements are the discriminability of
                     image representations and the efficiency in computation
                     and storage of representations. Regarding the former
                     requirement, Convolutional Neural Network is proven to
                     be a very powerful tool to extract highly
                     discriminative local descriptors for effective image
                     search. Additionally, to further improve the
                     discriminative power of the descriptors, recent works
                     adopt fine-tuned strategies. In this article, taking a
                     different approach, we propose a novel, computationally
                     efficient, and competitive framework. Specifically, we
                     first propose various strategies to compute masks,
                     namely, SIFT-masks, SUM-mask, and MAX-mask, to select a
                     representative subset of local convolutional features
                     and eliminate redundant features. Our in-depth analyses
                     demonstrate that proposed masking schemes are effective
                     to address the burstiness drawback and improve
                     retrieval accuracy. Second, we propose to employ recent
                     embedding and aggregating methods that can
                     significantly boost the feature discriminability.
                     Regarding the computation and storage efficiency, we
                     include a hashing module to produce very compact binary
                     image representations. Extensive experiments on six
                     image retrieval benchmarks demonstrate that our
                     proposed framework achieves the state-of-the-art
                     retrieval performances.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {43},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@article{Shen:2019:LCS,
  author          = {Liquan Shen and Ping An and Guorui Feng},
  title           = {Low-Complexity Scalable Extension of the
                     High-Efficiency Video Coding {(SHVC)} Encoding System},
  journal         = j-TOMM,
  volume          = {15},
  number          = {2},
  pages           = {44:1--44:??},
  month           = jun,
  year            = {2019},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3313185},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Wed Oct 2 10:12:46 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/ft_gateway.cfm?id=3313185},
  abstract        = {The scalable extension of the high-efficiency video
                     coding (SHVC) system adopts a hierarchical
                     quadtree-based coding unit (CU) that is suitable for
                     various texture and motion properties of videos.
                     Currently, the test model of SHVC identifies the
                     optimal CU size by performing an exhaustive quadtree
                     depth-level search, which achieves a high compression
                     efficiency at a heavy cost in terms of the
                     computational complexity. However, many interactive
                     multimedia applications, such as remote monitoring and
                     video surveillance, which are sensitive to time delays,
                     have insufficient computational power for coding
                     high-definition (HD) and ultra-high-definition (UHD)
                     videos. Therefore, it is important, yet challenging, to
                     optimize the SHVC coding procedure and accelerate video
                     coding. In this article, we propose a fast CU quadtree
                     depth-level decision algorithm for inter-frames on
                     enhancement layers that is based on an analysis of
                     inter-layer, spatial, and temporal correlations. When
                     motion/texture properties of coding regions can be
                     identified early, a fast algorithm can be designed for
                     adapting CU depth-level decision procedures to video
                     contents and avoiding unnecessary computations during
                     CU depth-level traversal. The proposed algorithm
                     determines the motion activity level at the treeblock
                     size of the hierarchical quadtree by utilizing motion
                     vectors from its corresponding blocks at the base
                     layer. Based on the motion activity level, neighboring
                     encoded CUs that have larger correlations are
                     preferentially selected to predict the optimal depth
                     level of the current treeblock. Finally, two
                     parameters, namely, the motion activity level and the
                     predicted CU depth level, are used to identify a subset
                     of candidate CU depth levels and adaptively optimize CU
                     depth-level decision processes. The experimental
                     results demonstrate that the proposed scheme can run
                     approximately three times faster than the most recent
                     SHVC reference software, with a negligible loss of
                     compression efficiency. The proposed scheme is
                     efficient for all types of scalable video sequences
                     under various coding conditions and outperforms
                     state-of-the-art fast SHVC and HEVC algorithms. Our
                     scheme is a suitable candidate for interactive HD/UHD
                     video applications that are expected to operate in
                     real-time and power-constrained scenarios.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
}

@Article{Hu:2019:CAA,
  author =       "Jun Hu and Shengsheng Qian and Quan Fang and Xueliang
                 Liu and Changsheng Xu",
  title =        "{A$^2$CMHNE}: Attention-Aware Collaborative
                 Multimodal Heterogeneous Network Embedding",
  journal =      j-TOMM,
  volume =       "15",
  number =       "2",
  pages =        "45:1--45:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321506",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321506",
  abstract =     "Network representation learning is playing an
                 important role in network analysis due to its
                 effectiveness in a variety of applications. However,
                 most existing network embedding models focus on
                 homogeneous networks and neglect the diverse properties
                 such as different types of network structures and
                 associated multimedia content information. In this
                 article, we learn node representations for multimodal
                 heterogeneous networks, which contain multiple types of
                 nodes and/or links as well as multimodal content such
                 as texts and images. We propose a novel attention-aware
                 collaborative multimodal heterogeneous network
                 embedding method (A$^2$CMHNE), where an
                 attention-based collaborative representation learning
                 approach is proposed to promote the collaboration of
                 structure-based embedding and content-based embedding,
                 and generate the robust node representation by
                 introducing an attention mechanism that enables
                 informative embedding integration. In experiments, we
                 compare our model with existing network embedding
                 models on two real-world datasets. Our method leads to
                 dramatic improvements in performance by 5\%, and 9\%
                 compared with five state-of-the-art embedding methods
                 on one benchmark (M10 Dataset), and on a multi-modal
                 heterogeneous network dataset (WeChat dataset) for node
                 classification, respectively. Experimental results
                 demonstrate the effectiveness of our proposed method on
                 both node classification and link prediction tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hosny:2019:RCI,
  author =       "Khalid M. Hosny and Mohamed M. Darwish",
  title =        "Resilient Color Image Watermarking Using Accurate
                 Quaternion Radial Substituted {Chebyshev} Moments",
  journal =      j-TOMM,
  volume =       "15",
  number =       "2",
  pages =        "46:1--46:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325193",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3325193",
  abstract =     "In this work, a new quaternion-based method for color
                 image watermarking is proposed. In this method, a novel
                 set of quaternion radial substituted Chebyshev moments
                 (QRSCMs) is presented for robust geometrically
                 invariant image watermarking. An efficient
                 computational method is proposed for highly accurate,
                 fast, and numerically stable QRSCMs in polar
                 coordinates. The proposed watermarking method consists
                 of three stages. In the first stage, the Arnold
                 transform is used to improve the security of the
                 watermarking scheme by scrambling the binary watermark.
                 In the second stage, the proposed accurate and stable
                 QRSCMs of the host color image are computed. In the
                 third stage, the encrypted binary watermark is embedded
                 into the host image by employing the quantization
                 technique on selected-magnitude QRSCMs where the
                 watermarked color image is obtained by adding the
                 original host color image to the compensation image.
                 Then, the binary watermark can be extracted directly
                 without using the original image from the magnitudes of
                 QRSCMs. Numerical experiments are performed where the
                 performance of proposed method is compared with the
                 existing quaternion moment-based watermarking methods.
                 The comparison clearly shows that the proposed method
                 is very efficient in terms of the visual
                 imperceptibility capability and the robustness under
                 different attacks compared to the existing quaternion
                 moment-based watermarking algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Mou:2019:AVG,
  author =       "Wenxuan Mou and Hatice Gunes and Ioannis Patras",
  title =        "Alone versus In-a-group: a Multi-modal Framework for
                 Automatic Affect Recognition",
  journal =      j-TOMM,
  volume =       "15",
  number =       "2",
  pages =        "47:1--47:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321509",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:46 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321509",
  abstract =     "Recognition and analysis of human affect has been
                 researched extensively within the field of computer
                 science in the past two decades. However, most of the
                 past research in automatic analysis of human affect has
                 focused on the recognition of affect displayed by
                 people in individual settings and little attention has
                 been paid to the analysis of the affect expressed in
                 group settings. In this article, we first analyze the
                 affect expressed by each individual in terms of arousal
                 and valence dimensions in both individual and group
                 videos and then propose methods to recognize the
                 contextual information, i.e., whether a person is alone
                 or in-a-group by analyzing their face and body
                 behavioral cues. For affect analysis, we first devise
                 affect recognition models separately in individual and
                 group videos and then introduce a cross-condition
                 affect recognition model that is trained by combining
                 the two different types of data. We conduct a set of
                 experiments on two datasets that contain both
                 individual and group videos. Our experiments show that
                 (1) the proposed Volume Quantized Local Zernike Moments
                 Fisher Vector outperforms other unimodal features in
                 affect analysis; (2) the temporal learning model,
                 Long-Short Term Memory Networks, works better than the
                 static learning model, Support Vector Machine; (3)
                 decision fusion helps to improve affect recognition,
                 indicating that body behaviors carry emotional
                 information that is complementary rather than redundant
                 to the emotion content in facial behaviors; and (4) it
                 is possible to predict the context, i.e., whether a
                 person is alone or in-a-group, using their non-verbal
                 behavioral cues.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Hong:2019:ASS,
  author =       "Richang Hong",
  title =        "Advanced Stereo Seam Carving by Considering Occlusions
                 on Both Sides",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "69:1--69:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321513",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321513",
  abstract =     "Stereo image retargeting plays a significant role in
                 the field of image processing, which aims at making
                 major objects as prominent as possible when the
                 resolution of an image is changed, including
                 maintaining disparity and depth information at the same
                 time. Some seam carving methods are proposed to
                 preserve the geometric consistency of the images.
                 However, the regions of occlusion on both sides are not
                 considered properly. In this article, we propose a
                 solution to solve this problem. A new strategy of seams
                 finding is designed by considering occluded and
                 occluding regions on both of the input images, and
                 leaving geometric consistency in both images intact. We
                 also introduced the method of line segment detection
                 and superpixel segmentation to further improve the
                 quality of the images. Imaging effects are optimized in
                 the process and visual comfort, which is also
                 influenced by other factors, can be boosted as well.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "69",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2019:SET,
  author =       "Yun Zhang and Na Li and Sam Kwong and Gangyi Jiang and
                 Huanqiang Zeng",
  title =        "Statistical Early Termination and Early Skip Models
                 for Fast Mode Decision in {HEVC INTRA} Coding",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "70:1--70:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321510",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321510",
  abstract =     "In this article, statistical Early Termination (ET)
                 and Early Skip (ES) models are proposed for fast Coding
                 Unit (CU) and prediction mode decision in HEVC INTRA
                 coding, in which three categories of ET and ES
                 sub-algorithms are included. First, the CU ranges of
                 the current CU are recursively predicted based on the
                 texture and CU depth of the spatial neighboring CUs.
                 Second, the statistical model based ET and ES schemes
                 are proposed and applied to optimize the CU and INTRA
                 prediction mode decision, in which the coding
                 complexities over different decision layers are jointly
                 minimized subject to acceptable rate-distortion
                 degradation. Third, the mode correlations among the
                 INTRA prediction modes are exploited to early terminate
                 the full rate-distortion optimization in each CU
                 decision layer. Extensive experiments are performed to
                 evaluate the coding performance of each sub-algorithm
                 and the overall algorithm. Experimental results reveal
                 that the overall proposed algorithm can achieve 45.47\%
                 to 74.77\%, and 58.09\% on average complexity
                 reduction, while the overall Bj{\o}ntegaard delta bit
                 rate increase and Bj{\o}ntegaard delta peak
                 signal-to-noise ratio degradation are 2.29\% and $-0.11$
                 dB, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "70",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Gupta:2019:SGM,
  author =       "Abhinav Gupta and Divya Singhal",
  title =        "A Simplistic Global Median Filtering Forensics Based
                 on Frequency Domain Analysis of Image Residuals",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "71:1--71:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321508",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321508",
  abstract =     "Sophisticated image forgeries introduce digital image
                 forensics as an active area of research. In this area,
                 many researchers have addressed the problem of median
                 filtering forensics. Existing median filtering
                 detectors are adequate to classify median filtered
                 images in uncompressed mode and in compressed mode at
                 high-quality factors. Despite that, the field is
                 lacking a robust method to detect median filtering in
                 low-resolution images compressed with low-quality
                 factors. In this article, a novel feature set (four
                 feature dimensions), based on first-order statistics of
                 frequency contents of median filtered residuals (MFRs)
                 of original and median filtered images, has been
                 proposed. The proposed feature set outperforms
                 handcrafted features-based state-of-the-art detectors
                 in terms of feature set dimensions and detection
                 results obtained for low-resolution images at all
                 quality factors. Also, results reveal the efficacy of
                 proposed method over deep-learning-based median
                 filtering detector. Comprehensive results expose the
                 efficacy of the proposed detector to detect median
                 filtering against other similar manipulations.
                 Additionally, generalization ability test on
                 cross-database images support the cross-validation
                 results on four different databases. Thus, our proposed
                 detector meets the current challenges in the field, to
                 a great extent.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "71",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Wu:2019:HVO,
  author =       "Kan Wu and Guanbin Li and Haofeng Li and Jianjun Zhang
                 and Yizhou Yu",
  title =        "Harvesting Visual Objects from {Internet} Images via
                 Deep-Learning-Based Objectness Assessment",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "72:1--72:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3318463",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3318463",
  abstract =     "The collection of internet images has been growing in
                 an astonishing speed. It is undoubted that these images
                 contain rich visual information that can be useful in
                 many applications, such as visual media creation and
                 data-driven image synthesis. In this article, we focus
                 on the methodologies for building a visual object
                 database from a collection of internet images. Such
                 database is built to contain a large number of
                 high-quality visual objects that can help with various
                 data-driven image applications. Our method is based on
                 dense proposal generation and objectness-based
                 re-ranking. A novel deep convolutional neural network
                 is designed for the inference of proposal objectness,
                 the probability of a proposal containing optimally
                 located foreground object. In our work, the objectness
                 is quantitatively measured in regard of completeness
                 and fullness, reflecting two complementary features of
                 an optimal proposal: a complete foreground and
                 relatively small background. Our experiments indicate
                 that object proposals re-ranked according to the output
                 of our network generally achieve higher performance
                 than those produced by other state-of-the-art methods.
                 As a concrete example, a database of over 1.2 million
                 visual objects has been built using the proposed
                 method, and has been successfully used in various
                 data-driven image applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "72",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yuan:2019:SSP,
  author =       "Yuan Yuan and Jie Fang and Xiaoqiang Lu and Yachuang
                 Feng",
  title =        "Spatial Structure Preserving Feature Pyramid Network
                 for Semantic Image Segmentation",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "73:1--73:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321512",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321512",
  abstract =     "Recently, progress on semantic image segmentation is
                 substantial, benefiting from the rapid development of
                 Convolutional Neural Networks. Semantic image
                 segmentation approaches proposed lately have been
                 mostly based on Fully convolutional Networks (FCNs).
                 However, these FCN-based methods use large receptive
                 fields and too many pooling layers to depict the
                 discriminative semantic information of the images.
                 Specifically, on one hand, convolutional kernel with
                 large receptive field smooth the detailed edges, since
                 too much contexture information is used to depict the
                 ``center pixel.'' However, the pooling layer increases
                 the receptive field through zooming out the latest
                 feature maps, which loses many detailed information of
                 the image, especially in the deeper layers of the
                 network. These operations often cause low spatial
                 resolution inside deep layers, which leads to spatially
                 fragmented prediction. To address this problem, we
                 exploit the inherent multi-scale and pyramidal
                 hierarchy of deep convolutional networks to extract the
                 feature maps with different resolutions and take full
                 advantages of these feature maps via a gradually
                 stacked fusing way. Specifically, for two adjacent
                 convolutional layers, we upsample the features from
                 deeper layer with stride of 2 and then stack them on
                 the features from shallower layer. Then, a
                 convolutional layer with kernels of $ 1 \times 1 $ is
                 followed to fuse these stacked features. The fused
                 feature preserves the spatial structure information of
                 the image; meanwhile, it owns strong discriminative
                 capability for pixel classification. Additionally, to
                 further preserve the spatial structure information and
                 regional connectivity of the predicted category label
                 map, we propose a novel loss term for the network. In
                 detail, two graph model-based spatial affinity matrixes
                 are proposed, which are used to depict the pixel-level
                 relationships in the input image and predicted category
                 label map respectively, and then their cosine distance
                 is backward propagated to the network. The proposed
                 architecture, called spatial structure preserving
                 feature pyramid network, significantly improves the
                 spatial resolution of the predicted category label map
                 for semantic image segmentation. The proposed method
                 achieves state-of-the-art results on three public and
                 challenging datasets for semantic image segmentation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "73",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Zhang:2019:MFA,
  author =       "Junxuan Zhang and Haifeng Hu and Xinlong Lu",
  title =        "Moving Foreground-Aware Visual Attention and Key
                 Volume Mining for Human Action Recognition",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "74:1--74:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321511",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321511",
  abstract =     "Recently, many deep learning approaches have shown
                 remarkable progress on human action recognition.
                 However, it remains unclear how to extract the useful
                 information in videos since only video-level labels are
                 available in the training phase. To address this
                 limitation, many efforts have been made to improve the
                 performance of action recognition by applying the
                 visual attention mechanism in the deep learning model.
                 In this article, we propose a novel deep model called
                 Moving Foreground Attention (MFA) that enhances the
                 performance of action recognition by guiding the model
                 to focus on the discriminative foreground targets. In
                 our work, MFA detects the moving foreground through a
                 proposed variance-based algorithm. Meanwhile, an
                 unsupervised proposal is utilized to mine the
                 action-related key volumes and generate corresponding
                 correlation scores. Based on these scores, a newly
                 proposed stochastic-out scheme is exploited to train
                 the MFA. Experiment results show that action
                 recognition performance can be significantly improved
                 by using our proposed techniques, and our model
                 achieves state-of-the-art performance on UCF101 and
                 HMDB51.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "74",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{More:2019:PLA,
  author =       "Amit More and Subhasis Chaudhuri",
  title =        "A Pseudo-likelihood Approach for Geo-localization of
                 Events from Crowd-sourced Sensor-Metadata",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "75:1--75:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321701",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321701",
  abstract =     "Events such as live concerts, protest marches, and
                 exhibitions are often video recorded by many people at
                 the same time, typically using smartphone devices. In
                 this work, we address the problem of geo-localizing
                 such events from crowd-generated data. Traditional
                 approaches for solving such a problem using multiple
                 video sequences of the event would require highly
                 complex computer vision (CV) methods, which are
                 computation intensive and are not robust under the
                 environment where visual data are collected through
                 crowd-sourced medium. In the present work, we approach
                 the problem in a probabilistic framework using only the
                 sensor metadata obtained from smartphones. We model the
                 event location and camera locations and orientations
                 (camera parameters) as the hidden states in a Hidden
                 Markov Model. The sensor metadata from GPS and the
                 digital compass from user smartphones are used as the
                 observations associated with the hidden states of the
                 model. We have used a suitable potential function to
                 capture the complex interaction between the hidden
                 states (i.e., event location and camera parameters).
                 The non-Gaussian densities involved in the model, such
                 as the potential function involving hidden states, make
                 the maximum-likelihood estimation intractable. We
                 propose a pseudo-likelihood-based approach to maximize
                 the approximate-likelihood, which provides a tractable
                 solution to the problem. The experimental results on
                 the simulated as well as real data show correct event
                 geo-localization using the proposed method. When
                 compared with several baselines the proposed method
                 shows a superior performance. The overall computation
                 time required is much smaller, since only the sensor
                 metadata are used instead of visual data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "75",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Shah:2019:PCB,
  author =       "Mohsin Shah and Weiming Zhang and Honggang Hu and
                 Nenghai Yu",
  title =        "{Paillier} Cryptosystem based Mean Value Computation
                 for Encrypted Domain Image Processing Operations",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "76:1--76:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325194",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3325194",
  abstract =     "Due to its large storage facility and high-end
                 computing capability, cloud computing has received
                 great attention as a huge amount of personal multimedia
                 data and computationally expensive tasks can be
                 outsourced to the cloud. However, the cloud being
                 third-party semi-trusted, is prone to information
                 leakage, raising privacy risks. Signal processing in
                 the encrypted domain has emerged as a new research
                 paradigm on privacy-preserving processing over
                 outsourced data by semi-trusted cloud. In this article,
                 we propose a solution for non-integer mean value
                 computation in the homomorphic encrypted domain without
                 any interactive protocol between the client and the
                 service provider. Using the proposed solution, various
                 image processing operations, such as local smoothing
                 filter, un-sharp masking, and histogram equalization,
                 can be performed in the encrypted domain at the cloud
                 server without any privacy concerns. Our experimental
                 results from standard test images reveal that these
                 image processing operations can be performed without
                 pre-processing, without client-server interactive
                 protocol, and without any error between the encrypted
                 domain and the plain domain.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "76",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@Article{Yue:2019:SRS,
  author =       "Guanghui Yue and Chunping Hou and Tianwei Zhou",
  title =        "Subtitle Region Selection of {S3D} Images in
                 Consideration of Visual Discomfort and Viewing Habit",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3",
  pages =        "77:1--77:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325197",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Oct 2 10:12:47 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3325197",
  abstract =     "Subtitles, serving as a linguistic approximation of
                 the visual content, are an essential element in
                 stereoscopic advertisement and the film industry. Due
                 to the vergence accommodation conflict, the
                 stereoscopic 3D (S3D) subtitle inevitably causes visual
                 discomfort. To meet the viewing experience, the
                 subtitle region should be carefully arranged.
                 Unfortunately, very few works have been dedicated to
                 this area. In this article, we propose a method for S3D
                 subtitle region selection in consideration of visual
                 discomfort and viewing habit. First, we divide the
                 disparity map into multiple depth layers according to
                 the disparity value. The preferential processed depth
                 layer is determined by considering the disparity value
                 of the foremost object. Second, the optimal region and
                 coarse disparity value for S3D subtitle insertion are
                 chosen by convolving the selective depth layer with the
                 mean filter. Specifically, the viewing habit is
                 considered during the region selection. Finally, after
                 region selection, the disparity value of the subtitle
                 is further modified by using the just noticeable depth
                 difference (JNDD) model. Given that there is no public
                 database reported for the evaluation of S3D subtitle
                 insertion, we collect 120 S3D images as the test
                 platform. Both objective and subjective experiments are
                 conducted to evaluate the comfort degree of the
                 inserted subtitle. Experimental results demonstrate
                 that the proposed method can obtain promising
                 performance in improving the viewing experience of the
                 inserted subtitle.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "77",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J961",
}

@article{Li:2019:LCB,
  author          = {Yehao Li and Yingwei Pan and Ting Yao and Hongyang
                     Chao and Yong Rui and Tao Mei},
  title           = {Learning Click-Based Deep Structure-Preserving
                     Embeddings with Visual Attention},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {78},
  pages           = {78:1--78:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3328994},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3328994},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {One fundamental problem in image search is to learn
                     the ranking functions (i.e., the similarity between
                     query and image). Recent progress on this topic has
                     evolved through two paradigms: the text-based model and
                     image ranker learning. The former relies on image
                     surrounding texts, making the similarity sensitive to
                     the quality of textual descriptions. The latter may
                     suffer from the robustness problem when human-labeled
                     query-image pairs cannot represent user search intent
                     precisely. We demonstrate in this article that the
                     preceding two limitations can be well mitigated by
                     learning a cross-view embedding that leverages click
                     data. Specifically, a novel click-based Deep
                     Structure-Preserving Embeddings with visual Attention
                     (DSPEA) model is presented, which consists of two
                     components: deep convolutional neural networks followed
                     by image embedding layers for learning visual
                     embedding, and a deep neural networks for generating
                     query semantic embedding. Meanwhile, visual attention
                     is incorporated at the top of the convolutional neural
                     network to reflect the relevant regions of the image to
                     the query. Furthermore, considering the high dimension
                     of the query space, a new click-based representation on
                     a query set is proposed for alleviating this sparsity
                     problem. The whole network is end-to-end trained by
                     optimizing a large margin objective that combines
                     cross-view ranking constraints with in-view
                     neighborhood structure preservation constraints. On a
                     large-scale click-based image dataset with 11.7 million
                     queries and 1 million images, our model is shown to be
                     powerful for keyword-based image search with superior
                     performance over several state-of-the-art methods and
                     achieves, to date, the best reported NDCG@25 of
                     52.21\%.},
}

@article{Cao:2019:SOG,
  author          = {Tengfei Cao and Changqiao Xu and Mu Wang and Zhongbai
                     Jiang and Xingyan Chen and Lujie Zhong and Luigi
                     Alfredo Grieco},
  title           = {Stochastic Optimization for Green Multimedia Services
                     in Dense {$5$G} Networks},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {79},
  pages           = {79:1--79:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3328996},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3328996},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The manyfold capacity magnification promised by dense
                     5G networks will make possible the provisioning of
                     broadband multimedia services, including virtual
                     reality, augmented reality, and mobile immersive video,
                     to name a few. These new applications will coexist with
                     classic ones and contribute to the exponential growth
                     of multimedia services in mobile networks. At the same
                     time, the different requirements of past and old
                     services pose new challenges to the effective usage of
                     5G resources. In response to these challenges, a novel
                     Stochastic Optimization framework for Green Multimedia
                     Services named SOGMS is proposed herein that targets
                     the maximization of system throughput and the
                     minimization of energy consumption in data delivery. In
                     particular, Lyapunov optimization is leveraged to face
                     this optimization objective, which is formulated and
                     decomposed into three tractable subproblems. For each
                     subproblem, a distinct algorithm is conceived, namely
                     quality of experience--based admission control,
                     cooperative resource allocation, and multimedia
                     services scheduling. Finally, extensive simulations are
                     carried out to evaluate the proposed method against
                     state-of-art solutions in dense 5G networks.},
}

@article{Wu:2019:PAT,
  author          = {Jie Wu and Haifeng Hu and Liang Yang},
  title           = {Pseudo-{$3$D} Attention Transfer Network with
                     Content-aware Strategy for Image Captioning},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {80},
  pages           = {80:1--80:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3336495},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3336495},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {In this article, we propose a novel Pseudo-3D
                     Attention Transfer network with Content-aware Strategy
                     (P3DAT-CAS) for the image captioning task. Our model is
                     composed of three parts: the Pseudo-3D Attention (P3DA)
                     network, the P3DA-based Transfer (P3DAT) network, and
                     the Content-aware Strategy (CAS). First, we propose
                     P3DA to take full advantage of three-dimensional (3D)
                     information in convolutional feature maps and capture
                     more details. Most existing attention-based models only
                     extract the 2D spatial representation from
                     convolutional feature maps to decide which area should
                     be paid more attention to. However, convolutional
                     feature maps are 3D and different channel features can
                     detect diverse semantic attributes associated with
                     images. P3DA is proposed to combine 2D spatial maps
                     with 1D semantic-channel attributes and generate more
                     informative captions. Second, we design the transfer
                     network to maintain and transfer the key previous
                     attention information. The traditional attention-based
                     approaches only utilize the current attention
                     information to predict words directly, whereas transfer
                     network is able to learn long-term attention
                     dependencies and explore global modeling pattern.
                     Finally, we present CAS to provide a more relevant and
                     distinct caption for each image. The captioning model
                     trained by maximum likelihood estimation may generate
                     the captions that have a weak correlation with image
                     contents, resulting in the cross-modal gap between
                     vision and linguistics. However, CAS is helpful to
                     convey the meaningful visual contents accurately.
                     P3DAT-CAS is evaluated on Flickr30k and MSCOCO, and it
                     achieves very competitive performance among the
                     state-of-the-art models.},
}

@article{Wang:2019:DSS,
  author          = {Min Wang and Wengang Zhou and Qi Tian and Houqiang
                     Li},
  title           = {Deep Scalable Supervised Quantization by
                     Self-Organizing Map},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {81},
  pages           = {81:1--81:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3328995},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3328995},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Approximate Nearest Neighbor (ANN) search is an
                     important research topic in multimedia and computer
                     vision fields. In this article, we propose a new deep
                     supervised quantization method by Self-Organizing Map
                     to address this problem. Our method integrates the
                     Convolutional Neural Networks and Self-Organizing Map
                     into a unified deep architecture. The overall training
                     objective optimizes supervised quantization loss as
                     well as classification loss. With the supervised
                     quantization objective, we minimize the differences on
                     the maps between similar image pairs and maximize the
                     differences on the maps between dissimilar image pairs.
                     By optimization, the deep architecture can
                     simultaneously extract deep features and quantize the
                     features into suitable nodes in self-organizing map. To
                     make the proposed deep supervised quantization method
                     scalable for large datasets, instead of constructing a
                     larger self-organizing map, we propose to divide the
                     input space into several subspaces and construct
                     self-organizing map in each subspace. The
                     self-organizing maps in all the subspaces implicitly
                     construct a large self-organizing map, which costs less
                     memory and training time than directly constructing a
                     self-organizing map with equal size. The experiments on
                     several public standard datasets prove the superiority
                     of our approaches over the existing ANN search methods.
                     Besides, as a by-product, our deep architecture can be
                     directly applied to visualization with little
                     modification, and promising performance is demonstrated
                     in the experiments.},
}

@article{Ozcelik:2019:CDA,
  author          = {Ihsan Mert Ozcelik and Cem Ersoy},
  title           = {Chunk Duration-Aware {SDN}-Assisted {DASH}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {82},
  pages           = {82:1--82:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3337681},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3337681},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Although Dynamic Adaptive Streaming over HTTP (DASH)
                     is the pillar of multimedia content delivery
                     mechanisms, its purely client-based adaptive video
                     bitrate mechanisms have quality-of-experience fairness
                     and stability problems in the existence of multiple
                     DASH clients and highly fluctuating background traffic
                     on the same shared bottleneck link. Varying chunk
                     duration among different titles of multiple video
                     providers exacerbates this problem. With the help of
                     the global network view provided by the
                     software-defined networking paradigm, we propose a
                     centralized joint optimization module-assisted adaptive
                     video bitrate mechanism that takes diversity of chunk
                     sizes among different content into account. Our system
                     collects possible video bitrate levels and chunk
                     duration from DASH clients and simply calculates the
                     optimal video bitrates per client based on the
                     available capacity and chunk duration of each client's
                     selected content while not invading users' privacy. By
                     continuously following the background traffic flows, it
                     asynchronously updates the target video bitrate levels
                     to avoid both buffer stall events and network
                     underutilization issues rather than bandwidth slicing,
                     which brings about scalability problems in practice. It
                     also guarantees fair startup delays for video sessions
                     with various chunk duration. Our experiments clearly
                     show that our proposed approach considering diversity
                     of chunk duration and that background traffic
                     fluctuations can significantly provide a better and
                     fair quality of experience in terms of structural
                     similarity--based video quality and startup delay
                     compared to both purely client-based and
                     state-of-the-art software-defined networking--based
                     adaptive bitrate mechanisms.},
}

@article{Zhuang:2019:RCI,
  author          = {Naifan Zhuang and Guo-Jun Qi and The Duc Kieu and Kien
                     A. Hua},
  title           = {Rethinking the Combined and Individual Orders of
                     Derivative of States for Differential Recurrent Neural
                     Networks: Deep Differential Recurrent Neural Networks},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {83},
  pages           = {83:1--83:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3337928},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3337928},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Due to their special gating schemes, Long Short-Term
                     Memory (LSTM) has shown greater potential to process
                     complex sequential information than the traditional
                     Recurrent Neural Network (RNN). The conventional LSTM,
                     however, fails to take into consideration the impact of
                     salient spatio-temporal dynamics present in the
                     sequential input data. This problem was first addressed
                     by the differential Recurrent Neural Network (dRNN),
                     which uses a differential gating scheme known as
                     Derivative of States (DoS). DoS uses higher orders of
                     internal state derivatives to analyze the change in
                     information gain originated from the salient motions
                     between the successive frames. The weighted combination
                     of several orders of DoS is then used to modulate the
                     gates in dRNN. While each individual order of DoS is
                     good at modeling a certain level of salient
                     spatio-temporal sequences, the sum of all the orders of
                     DoS could distort the detected motion patterns. To
                     address this problem, we propose to control the LSTM
                     gates via individual orders of DoS. To fully utilize
                     the different orders of DoS, we further propose to
                     stack multiple levels of LSTM cells in an increasing
                     order of state derivatives. The proposed model
                     progressively builds up the ability of the LSTM gates
                     to detect salient dynamical patterns in deeper stacked
                     layers modeling higher orders of DoS; thus, the
                     proposed LSTM model is termed deep differential
                     Recurrent Neural Network (d$^2$ RNN). The effectiveness
                     of the proposed model is demonstrated on three publicly
                     available human activity datasets: NUS-HGA,
                     Violent-Flows, and UCF101. The proposed model
                     outperforms both LSTM and non-LSTM based
                     state-of-the-art algorithms.},
}

@article{Wang:2019:EBD,
  author          = {Zhangcheng Wang and Ya Li and Richang Hong and Xinmei
                     Tian},
  title           = {Eigenvector-Based Distance Metric Learning for Image
                     Classification and Retrieval},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {15},
  number          = {3},
  articleno       = {84},
  pages           = {84:1--84:??},
  month           = sep,
  year            = {2019},
  DOI             = {https://doi.org/10.1145/3340262},
  URL             = {https://dl.acm.org/ft_gateway.cfm?id=3340262},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J961},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  bibdate         = {Wed Oct 2 10:12:47 MDT 2019},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Distance metric learning has been widely studied in
                     multifarious research fields. The mainstream approaches
                     learn a Mahalanobis metric or learn a linear
                     transformation. Recent related works propose learning a
                     linear combination of base vectors to approximate the
                     metric. In this way, fewer variables need to be
                     determined, which is efficient when facing
                     high-dimensional data. Nevertheless, such works obtain
                     base vectors using additional data from related domains
                     or randomly generate base vectors. However, obtaining
                     base vectors from related domains requires extra time
                     and additional data, and random vectors introduce
                     randomness into the learning process, which requires
                     sufficient random vectors to ensure the stability of
                     the algorithm. Moreover, the random vectors cannot
                     capture the rich information of the training data,
                     leading to a degradation in performance. Considering
                     these drawbacks, we propose a novel distance metric
                     learning approach by introducing base vectors
                     explicitly learned from training data. Given a specific
                     task, we can make a sparse approximation of its
                     objective function using the top eigenvalues and
                     corresponding eigenvectors of a predefined integral
                     operator on the reproducing kernel Hilbert space.
                     Because the process of generating eigenvectors simply
                     refers to the training data of the considered task, our
                     proposed method does not require additional data and
                     can reflect the intrinsic information of the input
                     features. Furthermore, the explicitly learned
                     eigenvectors do not result in randomness, and we can
                     extend our method to any kernel space without changing
                     the objective function. We only need to learn the
                     coefficients of these eigenvectors, and the only
                     hyperparameter that we need to determine is the number
                     of eigenvectors that we utilize. Additionally, an
                     optimization algorithm is proposed to efficiently solve
                     this problem. Extensive experiments conducted on
                     several datasets demonstrate the effectiveness of our
                     proposed method.},
}

@Article{Pala:2020:ISI,
  author =       "Pietro Pala and Liming Chen and Di Huang and Xiaoming
                 Liu and Stefanos Zafeiriou",
  title =        "Introduction to the Special Issue on Face Analysis
                 Applications",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "85:1--85:2",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3359624",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3359624",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "85",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Feng:2020:UTB,
  author =       "Zhen-Hua Feng and Josef Kittler and Bill Christmas and
                 Xiao-Jun Wu",
  title =        "A Unified Tensor-based Active Appearance Model",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "86:1--86:22",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3338841",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3338841",
  abstract =     "Appearance variations result in many difficulties in
                 face image analysis. To deal with this challenge, we
                 present a Unified Tensor-based Active Appearance Model
                 (UT-AAM) for jointly modelling the geometry and texture
                 information of 2D faces. For each \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "86",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shamai:2020:SFP,
  author =       "Gil Shamai and Ron Slossberg and Ron Kimmel",
  title =        "Synthesizing Facial Photometries and Corresponding
                 Geometries Using Generative Adversarial Networks",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "87:1--87:24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3337067",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3337067",
  abstract =     "Artificial data synthesis is currently a well-studied
                 topic with useful applications in data science,
                 computer vision, graphics, and many other fields.
                 Generating realistic data is especially challenging,
                 since human perception is highly sensitive to
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "87",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2020:UNC,
  author =       "Xueping Wang and Yunhong Wang and Weixin Li",
  title =        "{U-Net} Conditional {GANs} for Photo-Realistic and
                 Identity-Preserving Facial Expression Synthesis",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "88:1--88:23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3355397",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3355397",
  abstract =     "Facial expression synthesis (FES) is a challenging
                 task since the expression changes are highly non-linear
                 and depend on the facial appearance. Person identity
                 should also be well preserved in the synthesized face.
                 In this article, we present a novel U- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "88",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2020:EFA,
  author =       "Zhiwei Liu and Xiangyu Zhu and Ming Tang and Zhen Lei
                 and Jinqiao Wang",
  title =        "Efficient Face Alignment with Fast Normalization and
                 Contour Fitting Loss",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "89:1--89:16",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3338842",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3338842",
  abstract =     "Face alignment is a key component of numerous face
                 analysis tasks. In recent years, most existing methods
                 have focused on designing high-performance face
                 alignment systems and paid less attention to
                 efficiency. However more face alignment systems are
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "89",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Duan:2020:VAA,
  author =       "Huiyu Duan and Xiongkuo Min and Yi Fang and Lei Fan
                 and Xiaokang Yang and Guangtao Zhai",
  title =        "Visual Attention Analysis and Prediction on Human
                 Faces for Children with Autism Spectrum Disorder",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "90:1--90:23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3337066",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3337066",
  abstract =     "The focus of this article is to analyze and predict
                 the visual attention of children with Autism Spectrum
                 Disorder (ASD) when looking at human faces. Social
                 difficulties are the hallmark features of ASD and will
                 lead to atypical visual attention toward \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "90",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Duan:2020:FEM,
  author =       "Mingxing Duan and Kenli Li and Xiangke Liao and Keqin
                 Li and Qi Tian",
  title =        "Features-Enhanced Multi-Attribute Estimation with
                 Convolutional Tensor Correlation Fusion Network",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "91:1--91:23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3355542",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3355542",
  abstract =     "To achieve robust facial attribute estimation, a
                 hierarchical prediction system referred to as tensor
                 correlation fusion network (TCFN) is proposed for
                 attribute estimation. The system includes feature
                 extraction, correlation excavation among facial
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "91",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 92 (January 2020): two-page introduction to
%%% the special issue on affective computing; this record carries no
%%% abstract field.
@Article{Zhao:2020:ISI,
  author =       "Sicheng Zhao and Dhiraj Joshi and Mohammad Soleymani
                 and Qiang Ji",
  title =        "Introduction to the Special Issue on Affective
                 Computing for Large-scale Heterogeneous Multimedia
                 Data",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--2",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365845",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3365845",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "92",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 93 (January 2020): survey of affective
%%% computing for large-scale heterogeneous multimedia data.
@Article{Zhao:2020:ACL,
  author =       "Sicheng Zhao and Shangfei Wang and Mohammad Soleymani
                 and Dhiraj Joshi and Qiang Ji",
  title =        "Affective Computing for Large-scale Heterogeneous
                 Multimedia Data: a Survey",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--32",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3363560",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3363560",
  abstract =     "The wide popularity of digital photography and social
                 networks has generated a rapidly growing volume of
                 multimedia data (i.e., images, music, and videos),
                 resulting in a great demand for managing, retrieving,
                 and understanding these data. Affective \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "93",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 94 (January 2020): characterization of
%%% subtle facial movements in video on a Riemannian manifold.
@Article{Hong:2020:CSF,
  author =       "Xiaopeng Hong and Wei Peng and Mehrtash Harandi and
                 Ziheng Zhou and Matti Pietik{\"a}inen and Guoying
                 Zhao",
  title =        "Characterizing Subtle Facial Movements via
                 {Riemannian} Manifold",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342227",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342227",
  abstract =     "Characterizing subtle facial movements from videos is
                 one of the most intensive topics in computer vision
                 research. It is, however, challenging, since (1) the
                 intensity of subtle facial muscle movement is usually
                 low, (2) the duration may be transient, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "94",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 95 (January 2020): emotion recognition from
%%% physiological signals via high-order correlation learning.
@Article{Zhu:2020:PSB,
  author =       "Junjie Zhu and Yuxuan Wei and Yifan Feng and Xibin
                 Zhao and Yue Gao",
  title =        "Physiological Signals-based Emotion Recognition via
                 High-order Correlation Learning",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--18",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3332374",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3332374",
  abstract =     "Emotion recognition by physiological signals is an
                 effective way to discern the inner state of human
                 beings and therefore has been widely adopted in many
                 user-centered applications. The majority of current
                 state-of-the-art methods focus on exploring \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "95",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 96 (January 2020): visual sentiment
%%% representation learned with strongly and weakly supervised CNNs.
@Article{She:2020:LDS,
  author =       "Dongyu She and Ming Sun and Jufeng Yang",
  title =        "Learning Discriminative Sentiment Representation from
                 Strongly- and Weakly Supervised {CNNs}",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--19",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3326335",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3326335",
  abstract =     "Visual sentiment analysis is attracting increasing
                 attention with the rapidly growing amount of images
                 uploaded to social networks. Learning rich visual
                 representations often requires training deep
                 convolutional neural networks (CNNs) on massive
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "96",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 97 (January 2020): visual emotion
%%% classification with a hierarchical CNN-RNN approach.
@Article{Li:2020:HCR,
  author =       "Liang Li and Xinge Zhu and Yiming Hao and Shuhui Wang
                 and Xingyu Gao and Qingming Huang",
  title =        "A Hierarchical {CNN-RNN} Approach for Visual Emotion
                 Classification",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--17",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3359753",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3359753",
  abstract =     "Visual emotion classification is predicting emotional
                 reactions of people for the given visual content.
                 Psychological studies show that human emotions are
                 affected by various visual stimuli from low level to
                 high level, including contrast, color, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "97",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 98 (January 2020): semantic community
%%% detection via adaptively weighted low-rank approximation.
@Article{Yang:2020:ASC,
  author =       "Liang Yang and Yuexue Wang and Junhua Gu and Xiaochun
                 Cao and Xiao Wang and Di Jin and Guiguang Ding and
                 Jungong Han and Weixiong Zhang",
  title =        "Autonomous Semantic Community Detection via Adaptively
                 Weighted Low-rank Approximation",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--22",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3355393",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3355393",
  abstract =     "Identification of semantic community structures is
                 important for understanding the interactions and
                 sentiments of different groups of people and predicting
                 the social emotion. A robust community detection method
                 needs to autonomously determine the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "98",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 99 (January 2020): emotion-based human
%%% action video generation from a single person image.
@Article{Hou:2020:SDE,
  author =       "Yuxin Hou and Hongxun Yao and Xiaoshuai Sun and Haoran
                 Li",
  title =        "{Soul Dancer}: Emotion-Based Human Action Generation",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--19",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3340463",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3340463",
  abstract =     "Body language is one of the most common ways of
                 expressing human emotion. In this article, we make the
                 first attempt to generate an action video with a
                 specific emotion from a single person image. The goal
                 of the emotion-based action generation task \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "99",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 15(3s), article 100 (January 2020): affective content-aware
%%% adaptation scheme (ACAA) for QoE in DASH video streaming.
@Article{Hu:2020:ACA,
  author =       "Shenghong Hu and Min Xu and Haimin Zhang and Chunxia
                 Xiao and Chao Gui",
  title =        "Affective Content-aware Adaptation Scheme on {QoE}
                 Optimization of Adaptive Streaming over {HTTP}",
  journal =      j-TOMM,
  volume =       "15",
  number =       "3s",
  pages =        "1--18",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328997",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jan 23 07:04:18 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3328997",
  abstract =     "The article presents a novel affective content-aware
                 adaptation scheme (ACAA) to optimize Quality of
                 Experience (QoE) for dynamic adaptive video streaming
                 over HTTP (DASH). Most of the existing DASH adaptation
                 schemes conduct video bit-rate adaptation \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "100",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Nie:2020:HHG,
  author =       "Weizhi Nie and Weijie Wang and Anan Liu and Yuting Su
                 and Jie Nie",
  title =        "{HGAN}: Holistic Generative Adversarial Networks for
                 Two-dimensional Image-based Three-dimensional Object
                 Retrieval",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3344684",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3344684",
  abstract =     "In this article, we propose a novel method to address
                 the two-dimensional (2D) image-based 3D object
                 retrieval problem. First, we extract a set of virtual
                 views to represent each 3D object. Then, a
                 soft-attention model is utilized to find the weight of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "101",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2020:IVR,
  author =       "Mading Li and Jiaying Liu and Xiaoyan Sun and Zhiwei
                 Xiong",
  title =        "Image\slash Video Restoration via Multiplanar
                 Autoregressive Model and Low-Rank Optimization",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3341728",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3341728",
  abstract =     "In this article, we introduce an image/video
                 restoration approach by utilizing the high-dimensional
                 similarity in images/videos. After grouping similar
                 patches from neighboring frames, we propose to build a
                 multiplanar autoregressive (AR) model to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "102",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhong:2020:SDM,
  author =       "Sheng-Hua Zhong and Yuantian Wang and Tongwei Ren and
                 Mingjie Zheng and Yan Liu and Gangshan Wu",
  title =        "Steganographer Detection via Multi-Scale Embedding
                 Probability Estimation",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3352691",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3352691",
  abstract =     "Steganographer detection aims to identify the guilty
                 user who utilizes steganographic methods to hide secret
                 information in the spread of multimedia data,
                 especially image data, from a large amount of innocent
                 users on social networks. A true embedding \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "103",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{deAlmeida:2020:RPS,
  author =       "Marcos Alves de Almeida and Carolina Coimbra Vieira
                 and Pedro Olmo Stancioli {Vaz De Melo} and Renato
                 Martins Assun{\c{c}}{\~a}o",
  title =        "Random Playlists Smoothly Commuting Between Styles",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--20",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3361742",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3361742",
  abstract =     "Someone enjoys listening to playlists while commuting.
                 He wants a different playlist of n songs each day, but
                 always starting from Locked Out of Heaven, a Bruno Mars
                 song. The list should progress in smooth transitions
                 between successive and randomly \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "104",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ye:2020:SCM,
  author =       "Zhaoda Ye and Yuxin Peng",
  title =        "Sequential Cross-Modal Hashing Learning via
                 Multi-scale Correlation Mining",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--20",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3356338",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3356338",
  abstract =     "Cross-modal hashing aims to map heterogeneous
                 multimedia data into a common Hamming space through
                 hash function, and achieves fast and flexible
                 cross-modal retrieval. Most existing cross-modal
                 hashing methods learn hash function by mining the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "105",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2020:EIH,
  author =       "Shiguang Liu and Ziqing Huang",
  title =        "Efficient Image Hashing with Geometric Invariant
                 Vector Distance for Copy Detection",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--22",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3355394",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3355394",
  abstract =     "Hashing method is an efficient technique of multimedia
                 security for content protection. It maps an image into
                 a content-based compact code for denoting the image
                 itself. While most existing algorithms focus on
                 improving the classification between \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "106",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2020:LAB,
  author =       "Zhandong Liu and Wengang Zhou and Houqiang Li",
  title =        "{AB-LSTM}: Attention-based Bidirectional {LSTM} Model
                 for Scene Text Detection",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3356728",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3356728",
  abstract =     "Detection of scene text in arbitrary shapes is a
                 challenging task in the field of computer vision. Most
                 existing scene text detection methods exploit the
                 rectangle/quadrangular bounding box to denote the
                 detected text, which fails to accurately fit text
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "107",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Bhowmik:2020:EDA,
  author =       "Deepayan Bhowmik and Charith Abhayaratne",
  title =        "Embedding Distortion Analysis in Wavelet-domain
                 Watermarking",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3357333",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3357333",
  abstract =     "Imperceptibility and robustness are two complementary
                 fundamental requirements of any watermarking algorithm.
                 Low-strength watermarking yields high imperceptibility,
                 but exhibits poor robustness. High-strength
                 watermarking schemes achieve good \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "108",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shen:2020:VRS,
  author =       "Ling Shen and Richang Hong and Haoran Zhang and Xinmei
                 Tian and Meng Wang",
  title =        "Video Retrieval with Similarity-Preserving Deep
                 Temporal Hashing",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--16",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3356316",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3356316",
  abstract =     "Despite the fact that remarkable progress has been
                 made in recent years, Content-based Video Retrieval
                 (CBVR) is still an appealing research topic due to
                 increasing search demands in the Internet era of big
                 data. This article aims to explore an \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "109",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{VanderHooft:2020:TBA,
  author =       "Jeroen {Van der Hooft} and Maria {Torres Vega} and
                 Stefano Petrangeli and Tim Wauters and Filip {De
                 Turck}",
  title =        "Tile-based Adaptive Streaming for Virtual Reality
                 Video",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3362101",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3362101",
  abstract =     "The increasing popularity of head-mounted devices and
                 360${}^\circ $ video cameras allows content providers
                 to provide virtual reality (VR) video streaming over
                 the Internet, using a two-dimensional representation of
                 the immersive content combined with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "110",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Filho:2020:DPV,
  author =       "Roberto Iraja Tavares Da Costa Filho and Marcelo
                 Caggiani Luizelli and Stefano Petrangeli and Maria
                 {Torres Vega} and Jeroen {Van der Hooft} and Tim
                 Wauters and Filip {De Turck} and Luciano Paschoal
                 Gaspary",
  title =        "Dissecting the Performance of {VR} Video Streaming
                 through the {VR-EXP} Experimentation Platform",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--23",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3360286",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3360286",
  abstract =     "To cope with the massive bandwidth demands of Virtual
                 Reality (VR) video streaming, both the scientific
                 community and the industry have been proposing
                 optimization techniques such as viewport-aware
                 streaming and tile-based adaptive bitrate heuristics.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "111",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zheng:2020:ULH,
  author =       "Yunpeng Zheng and Xuelong Li and Xiaoqiang Lu",
  title =        "Unsupervised Learning of Human Action Categories in
                 Still Images with Deep Representations",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--20",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3362161",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3362161",
  abstract =     "In this article, we propose a novel method for
                 unsupervised learning of human action categories in
                 still images. In contrast to previous methods, the
                 proposed method explores distinctive information of
                 actions directly from unlabeled image databases,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "112",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xing:2020:ICC,
  author =       "Meng Xing and Zhiyong Feng and Yong Su and Jianhai
                 Zhang",
  title =        "An Image Cues Coding Approach for {$3$D} Human Pose
                 Estimation",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--20",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3368066",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3368066",
  abstract =     "Although Deep Convolutional Neural Networks (DCNNs)
                 facilitate the evolution of 3D human pose estimation,
                 ambiguity remains the most challenging problem in such
                 tasks. Inspired by the Human Perception Mechanism
                 (HPM), we propose an image-to-pose coding \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "113",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2020:EEA,
  author =       "Jinhuan Liu and Xuemeng Song and Liqiang Nie and Tian
                 Gan and Jun Ma",
  title =        "An End-to-End Attention-Based Neural Model for
                 Complementary Clothing Matching",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "1--16",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3368071",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3368071",
  abstract =     "In modern society, people tend to prefer fashionable
                 and decent outfits that can meet more than basic
                 physiological needs. In fact, a proper outfit usually
                 relies on good matching among complementary fashion
                 items (e.g., the top, bottom, and shoes) that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "114",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kua:2020:ACA,
  author =       "Jonathan Kua and Grenville Armitage and Philip Branch
                 and Jason But",
  title =        "Adaptive Chunklets and {AQM} for Higher-Performance
                 Content Streaming",
  journal =      j-TOMM,
  volume =       "15",
  number =       "4",
  pages =        "115:1--115:24",
  month =        jan,
  year =         "2020",
  DOI =          "https://doi.org/10.1145/3344381",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 11 08:35:19 MST 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3344381",
  abstract =     "Commercial streaming services such as Netflix and
                 YouTube use proprietary HTTP-based adaptive streaming
                 (HAS) techniques to deliver content to consumers
                 worldwide. MPEG recently developed Dynamic Adaptive
                 Streaming over HTTP (DASH) as a unifying \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "115",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2020:LLF,
  author          = {Bin Chen and Lingyan Ruan and Miu-Ling Lam},
  title           = {{LFGAN}: {$4$D} Light Field Synthesis from a Single {RGB}
                     Image},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {2:1--2:20},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3366371},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3366371},
  abstract        = {We present a deep neural network called the light field
                     generative adversarial network (LFGAN) that synthesizes
                     a 4D light field from a single 2D RGB image. We generate
                     light fields using a single image super-resolution
                     (SISR) technique based on two \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Ding:2020:AEU,
  author          = {Yuhang Ding and Hehe Fan and Mingliang Xu and Yi Yang},
  title           = {Adaptive Exploration for Unsupervised Person
                     Re-identification},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {3:1--3:19},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3369393},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3369393},
  abstract        = {Due to domain bias, directly deploying a deep person
                     re-identification (re-ID) model trained on one dataset
                     often achieves considerably poor accuracy on another
                     dataset. In this article, we propose an Adaptive
                     Exploration (AE) method to address the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Bentaleb:2020:DDQ,
  author          = {Abdelhak Bentaleb and Praveen Kumar Yadav and Wei Tsang
                     Ooi and Roger Zimmermann},
  title           = {{DQ-DASH}: a Queuing Theory Approach to Distributed
                     Adaptive Video Streaming},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {4:1--4:24},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3371040},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3371040},
  abstract        = {The significant popularity of HTTP adaptive video
                     streaming (HAS), such as Dynamic Adaptive Streaming over
                     HTTP (DASH), over the Internet has led to a stark
                     increase in user expectations in terms of video quality
                     and delivery robustness. This situation \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Huang:2020:RHR,
  author          = {Xin Huang and Yuxin Peng and Zhang Wen},
  title           = {{RCE-HIL}: Recognizing Cross-media Entailment with
                     Heterogeneous Interactive Learning},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {5:1--5:21},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3365003},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3365003},
  abstract        = {Entailment recognition is an important paradigm of
                     reasoning that judges if a hypothesis can be inferred
                     from given premises. However, previous efforts mainly
                     concentrate on text-based reasoning as recognizing
                     textual entailment (RTE), where the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Li:2020:CRT,
  author          = {Miaopeng Li and Zimeng Zhou and Xinguo Liu},
  title           = {Cross Refinement Techniques for Markerless Human Motion
                     Capture},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {6:1--6:18},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3372207},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3372207},
  abstract        = {This article presents a global 3D human pose estimation
                     method for markerless motion capture. Given two
                     calibrated images of a person, it first obtains the 2D
                     joint locations in the images using a pre-trained 2D
                     Pose CNN, then constructs the 3D pose \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Illahi:2020:CGF,
  author          = {Gazi Karam Illahi and Thomas {Van Gemert} and Matti
                     Siekkinen and Enrico Masala and Antti Oulasvirta and
                     Antti Yl{\"a}-J{\"a}{\"a}ski},
  title           = {Cloud Gaming with Foveated Video Encoding},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {7:1--7:24},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3369110},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3369110},
  abstract        = {Cloud gaming enables playing high-end games, originally
                     designed for PC or game console setups, on low-end
                     devices such as netbooks and smartphones, by offloading
                     graphics rendering to GPU-powered cloud servers.
                     However, transmitting the high-resolution \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Nguyen:2020:ETS,
  author          = {Duc V. Nguyen and Huyen T. T. Tran and Truong Cong
                     Thang},
  title           = {An Evaluation of Tile Selection Methods for
                     Viewport-Adaptive Streaming of 360-Degree Video},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {8:1--8:24},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3373359},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3373359},
  abstract        = {360-degree video has become increasingly popular
                     nowadays. For effective transmission of
                     bandwidth-intensive 360-degree video over networks,
                     viewport-adaptive streaming has been introduced. In this
                     article, we evaluate, for the first time, ten existing
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Yang:2020:LSS,
  author =       "Zhenguo Yang and Zehang Lin and Peipei Kang and
                 Jianming Lv and Qing Li and Wenyin Liu",
  title =        "Learning Shared Semantic Space with Correlation
                 Alignment for Cross-Modal Event Retrieval",
  journal =      j-TOMM,
  volume =       "16",
  number =       "1",
  pages =        "9:1--9:22",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3374754",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Apr 6 09:23:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3374754",
  abstract =     "In this article, we propose to learn shared semantic
                 space with correlation alignment (S$^3$CA) for
                 multimodal data representations, which aligns nonlinear
                 correlations of multimodal data distributions in deep
                 neural networks designed for heterogeneous \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2020:JSH,
  author          = {Junfeng Zhang and Haifeng Hu and Guobin Shen},
  title           = {Joint Stacked Hourglass Network and Salient Region
                     Attention Refinement for Robust Face Alignment},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {10:1--10:18},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3374760},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3374760},
  abstract        = {Facial landmark detection aims to locate keypoints for
                     facial images, which typically suffer from variations
                     caused by arbitrary pose, diverse facial expressions,
                     and partial occlusion. In this article, we propose a
                     coarse-to-fine framework that joins a \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Tasaka:2020:CSM,
  author          = {Shuji Tasaka},
  title           = {Causal Structures of Multidimensional {QoE} in
                     Haptic-Audiovisual Communications: {Bayesian} Modeling},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {11:1--11:23},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3375922},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3375922},
  abstract        = {This article proposes a methodology for building and
                     verifying plausible models that can express causation in
                     multidimensional QoE for haptic-audiovisual interactive
                     communications. For the modeling, we utilize subjective
                     experimental data of five-point \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Punn:2020:IUN,
  author          = {Narinder Singh Punn and Sonali Agarwal},
  title           = {Inception {U-Net} Architecture for Semantic Segmentation
                     to Identify Nuclei in Microscopy Cell Images},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {12:1--12:15},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3376922},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3376922},
  abstract        = {With the increasing applications of deep learning in
                     biomedical image analysis, in this article we introduce
                     an inception U-Net architecture for automating nuclei
                     detection in microscopy cell images of varying size and
                     modality to help unlock faster \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Chaudhary:2020:IRC,
  author          = {Chandramani Chaudhary and Poonam Goyal and Navneet Goyal
                     and Yi-Ping Phoebe Chen},
  title           = {Image Retrieval for Complex Queries Using Knowledge
                     Embedding},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {13:1--13:23},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3375786},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3375786},
  abstract        = {With the increase in popularity of image-based
                     applications, users are retrieving images using more
                     sophisticated and complex queries. We present three
                     types of complex queries, namely, long, ambiguous, and
                     abstract. Each type of query has its own \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Luo:2020:STS,
  author          = {Guoliang Luo and Zhigang Deng and Xin Zhao and Xiaogang
                     Jin and Wei Zeng and Wenqiang Xie and Hyewon Seo},
  title           = {Spatio-temporal Segmentation Based Adaptive Compression
                     of Dynamic Mesh Sequences},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {14:1--14:24},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377475},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377475},
  abstract        = {With the recent advances in data acquisition techniques,
                     the compression of various dynamic mesh sequence data
                     has become an important topic in the computer graphics
                     community. In this article, we present a new
                     spatio-temporal segmentation-based \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Pan:2020:FLB,
  author          = {Zhaoqing Pan and Xiaokai Yi and Yun Zhang and Hui Yuan
                     and Fu Lee Wang and Sam Kwong},
  title           = {Frame-level Bit Allocation Optimization Based on Video
                     Content Characteristics for {HEVC}},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {15:1--15:20},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3380827},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3380827},
  abstract        = {Rate control plays an important role in high efficiency
                     video coding (HEVC), and bit allocation is the
                     foundation of rate control. The video content
                     characteristics are significant for bit allocation, and
                     modeling an accurate relationship between video
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Ainam:2020:EAF,
  author          = {Jean-Paul Ainam and Ke Qin and Guisong Liu and Guangchun
                     Luo and Brighter Agyemang},
  title           = {Enforcing Affinity Feature Learning through
                     Self-attention for Person Re-identification},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {16:1--16:22},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377352},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377352},
  abstract        = {Person re-identification is the task of recognizing an
                     individual across heterogeneous non-overlapping camera
                     views. It has become a crucial capability needed by many
                     applications in public space video surveillance.
                     However, it remains a challenging \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Li:2020:DLA,
  author          = {Mengyan Li and Zhaoyu Zhang and Guochen Xie and Jun Yu},
  title           = {A Deep Learning Approach for Face Hallucination Guided
                     by Facial Boundary Responses},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {17:1--17:23},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377874},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377874},
  abstract        = {Face hallucination is a domain-specific super-resolution
                     (SR) problem of learning a mapping between a
                     low-resolution (LR) face image and its corresponding
                     high-resolution (HR) image. Tremendous progress on deep
                     learning has shown exciting potential for \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Gao:2020:EDL,
  author          = {Zan Gao and Yinming Li and Shaohua Wan},
  title           = {Exploring Deep Learning for View-Based {$3$D} Model
                     Retrieval},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1},
  pages           = {18:1--18:21},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377876},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Mon Apr 6 09:23:01 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377876},
  abstract        = {In recent years, view-based 3D model retrieval has
                     become one of the research focuses in the field of
                     computer vision and machine learning. In fact, the 3D
                     model retrieval algorithm consists of feature extraction
                     and similarity measurement, and the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2020:ISI,
  author          = {Shengping Zhang and Huiyu Zhou and Dong Xu and M. Emre
                     Celebi and Thierry Bouwmans},
  title           = {Introduction to the Special Issue on Multimodal Machine
                     Learning for Human Behavior Analysis},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {19:1--19:2},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3381917},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3381917},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Guo:2020:RVT,
  author          = {Changyong Guo and Zhaoxin Zhang and Jinjiang Li and
                     Xuesong Jiang and Jun Zhang and Lei Zhang},
  title           = {Robust Visual Tracking Using Kernel Sparse Coding on
                     Multiple Covariance Descriptors},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {20:1--20:22},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3360308},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3360308},
  abstract        = {In this article, we aim to improve the performance of
                     visual tracking by combing different features of
                     multiple modalities. The core idea is to use covariance
                     matrices as feature descriptors and then use sparse
                     coding to encode different features. The \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2020:CSO,
  author          = {Zhaoxin Zhang and Changyong Guo and Fanzhi Meng and
                     Taizhong Xu and Junkai Huang},
  title           = {{CovLets}: a Second-Order Descriptor for Modeling
                     Multiple Features},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {21:1--21:14},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3357525},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3357525},
  abstract        = {State-of-the-art techniques for image and video
                     classification take a bottom-up approach where local
                     features are aggregated into a global final
                     representation. Existing frameworks (i.e., bag of words
                     or Fisher vectors) are specifically designed to
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Meng:2020:ARU,
  author          = {Quanling Meng and Heyan Zhu and Weigang Zhang and
                     Xuefeng Piao and Aijie Zhang},
  title           = {Action Recognition Using Form and Motion Modalities},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {22:1--22:16},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3350840},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3350840},
  abstract        = {Action recognition has attracted increasing interest in
                     computer vision due to its potential applications in
                     many vision systems. One of the main challenges in
                     action recognition is to extract powerful features from
                     videos. Most existing approaches \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Shamsolmoali:2020:AAM,
  author          = {Pourya Shamsolmoali and Masoumeh Zareapoor and Huiyu
                     Zhou and Jie Yang},
  title           = {{AMIL}: Adversarial Multi-instance Learning for Human
                     Pose Estimation},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {23:1--23:23},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3355612},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3355612},
  abstract        = {Human pose estimation has an important impact on a wide
                     range of applications, from human-computer interface to
                     surveillance and content-based video retrieval. For
                     human pose estimation, joint obstructions and
                     overlapping upon human bodies result in \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {23},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhuang:2020:MAR,
  author          = {Yueting Zhuang and Dejing Xu and Xin Yan and Wenzhuo
                     Cheng and Zhou Zhao and Shiliang Pu and Jun Xiao},
  title           = {Multichannel Attention Refinement for Video Question
                     Answering},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {24:1--24:23},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3366710},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3366710},
  abstract        = {Video Question Answering (VideoQA) is the extension of
                     image question answering (ImageQA) in the video domain.
                     Methods are required to give the correct answer after
                     analyzing the provided video and question in this task.
                     Comparing to ImageQA, the most \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {24},
  fjournal        = {ACM Transactions on Multimedia Computing, Communications,
                     and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Grigorev:2020:DDD,
  author          = {Aleksei Grigorev and Shaohui Liu and Zhihong Tian and
                     Jianxin Xiong and Seungmin Rho and Jiang Feng},
  title           = {Delving Deeper in Drone-Based Person Re-Id by
                     Employing Deep Decision Forest and Attributes Fusion},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {25:1--25:15},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3360050},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3360050},
  abstract        = {Deep learning has revolutionized the field of computer
                     vision and image processing. Its ability to extract the
                     compact image representation has taken the person
                     re-identification (re-id) problem to a new level.
                     However, in most cases, researchers are \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Li:2020:SPG,
  author          = {Zhaoju Li and Zongwei Zhou and Nan Jiang and Zhenjun
                     Han and Junliang Xing and Jianbin Jiao},
  title           = {Spatial Preserved Graph Convolution Networks for
                     Person Re-identification},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {26:1--26:14},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3362988},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3362988},
  abstract        = {Person Re-identification is a very challenging task
                     due to inter-class ambiguity caused by similar
                     appearances, and large intra-class diversity caused by
                     viewpoints, illuminations, and poses. To address these
                     challenges, in this article, a graph \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {26},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Chen:2020:AAC,
  author          = {Hui Chen and Guiguang Ding and Zijia Lin and Sicheng
                     Zhao and Xiaopeng Gu and Wenyuan Xu and Jungong Han},
  title           = {{ACMNet}: Adaptive Confidence Matching Network for
                     Human Behavior Analysis via Cross-modal Retrieval},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {27:1--27:21},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3362065},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3362065},
  abstract        = {Cross-modality human behavior analysis has attracted
                     much attention from both academia and industry. In this
                     article, we focus on the cross-modality image-text
                     retrieval problem for human behavior analysis, which
                     can learn a common latent space for \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {27},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2020:MSS,
  author          = {Anran Zhang and Xiaolong Jiang and Baochang Zhang and
                     Xianbin Cao},
  title           = {Multi-scale Supervised Attentive Encoder--Decoder
                     Network for Crowd Counting},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {28:1--28:20},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3356019},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3356019},
  abstract        = {Crowd counting is a popular topic with widespread
                     applications. Currently, the biggest challenge to crowd
                     counting is large-scale variation in objects. In this
                     article, we focus on overcoming this challenge by
                     proposing a novel Attentive Encoder-Decoder \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {28},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Tanveer:2020:ISI,
  author          = {M. Tanveer and P. Khanna and M. Prasad and C. T. Lin},
  title           = {Introduction to the Special Issue on Computational
                     Intelligence for Biomedical Data and Imaging},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {29:1--29:4},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3381919},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3381919},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {29},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Tanveer:2020:MLT,
  author          = {M. Tanveer and B. Richhariya and R. U. Khan and A. H.
                     Rashid and P. Khanna and M. Prasad and C. T. Lin},
  title           = {Machine Learning Techniques for the Diagnosis of
                     {Alzheimer}'s Disease: a Review},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {30:1--30:35},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3344998},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3344998},
  abstract        = {Alzheimer's disease is an incurable neurodegenerative
                     disease primarily affecting the elderly population.
                     Efficient automated techniques are needed for early
                     diagnosis of Alzheimer's. Many novel approaches are
                     proposed by researchers for classification \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {30},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Yadav:2020:EDA,
  author          = {Shweta Yadav and Pralay Ramteke and Asif Ekbal and
                     Sriparna Saha and Pushpak Bhattacharyya},
  title           = {Exploring Disorder-Aware Attention for Clinical Event
                     Extraction},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {31:1--31:21},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3372328},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3372328},
  abstract        = {Event extraction is one of the crucial tasks in
                     biomedical text mining that aims to extract specific
                     information concerning incidents embedded in the texts.
                     In this article, we propose a deep learning framework
                     that aims to identify the attributes \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {31},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

%%% NOTE(review): the model name in the title is now typeset with a
%%% subscript L ($O_L$ConvNet); the earlier "O L ConvNet" looked like
%%% subscript markup lost in extraction --- confirm against the
%%% publisher record for this DOI.
@Article{Tripathi:2020:CNC,
  author =       "Suvidha Tripathi and Satish Kumar Singh",
  title =        "Cell Nuclei Classification in Histopathological Images
                 using {Hybrid $O_L$ConvNet}",
  journal =      j-TOMM,
  volume =       "16",
  number =       "1s",
  pages =        "32:1--32:22",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345318",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Apr 30 10:35:21 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3345318",
  abstract =     "Computer-aided histopathological image analysis for
                 cancer detection is a major research challenge in the
                 medical domain. Automatic detection and classification
                 of nuclei for cancer diagnosis impose a lot of
                 challenges in developing state-of-the-art \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2020:DSS,
  author          = {Nengjun Zhu and Jian Cao and Kunwei Shen and Xiaosong
                     Chen and Siji Zhu},
  title           = {A Decision Support System with Intelligent
                     Recommendation for Multi-disciplinary Medical
                     Treatment},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {33:1--33:23},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3352573},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3352573},
  abstract        = {Recent years have witnessed an emerging trend for
                     improving disease treatment by forming
                     multi-disciplinary medical teams. The collaboration
                     among specialists from multiple medical domains has
                     been shown to be significantly helpful for designing
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {33},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Wang:2020:RFS,
  author          = {Qingyong Wang and Yun Zhou and Weiping Ding and Zhiguo
                     Zhang and Khan Muhammad and Zehong Cao},
  title           = {Random Forest with Self-Paced Bootstrap Learning in
                     Lung Cancer Prognosis},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {34:1--34:12},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3345314},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3345314},
  abstract        = {Training gene expression data with supervised learning
                     approaches can provide an alarm sign for early
                     treatment of lung cancer to decrease death rates.
                     However, the samples of gene features involve lots of
                     noises in a realistic environment. In this \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {34},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Saini:2020:TEB,
  author          = {Naveen Saini and Sriparna Saha and Pushpak
                     Bhattacharyya and Himanshu Tuteja},
  title           = {Textual Entailment-Based Figure Summarization for
                     Biomedical Articles},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {35:1--35:24},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3357334},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3357334},
  abstract        = {This article proposes a novel unsupervised approach
                     (FigSum++) for automatic figure summarization in
                     biomedical scientific articles using a multi-objective
                     evolutionary algorithm. The problem is treated as an
                     optimization problem where relevant \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {35},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Tong:2020:PND,
  author          = {Chao Tong and Baoyu Liang and Mengze Zhang and
                     Rongshan Chen and Arun Kumar Sangaiah and Zhigao Zheng
                     and Tao Wan and Chenyang Yue and Xinyi Yang},
  title           = {Pulmonary Nodule Detection Based on {ISODATA}-Improved
                     Faster {RCNN} and {$3$D-CNN} with Focal Loss},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {36:1--36:9},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3365445},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3365445},
  abstract        = {The early diagnosis of pulmonary cancer can
                     significantly improve the survival rate of patients,
                     where pulmonary nodules detection in computed
                     tomography images plays an important role. In this
                     article, we propose a novel pulmonary nodule detection
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {36},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Agrawal:2020:HWB,
  author          = {Utkarsh Agrawal and Jatin Arora and Rahul Singh and
                     Deepak Gupta and Ashish Khanna and Aditya Khamparia},
  title           = {Hybrid Wolf--Bat Algorithm for Optimization of
                     Connection Weights in Multi-layer Perceptron},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {37:1--37:20},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3350532},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3350532},
  abstract        = {In a neural network, the weights act as parameters to
                     determine the output(s) from a set of inputs. The
                     weights are used to find the activation values of nodes
                     of a layer from the values of the previous layer.
                     Finding the ideal set of these weights for \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {37},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Rout:2020:ICA,
  author          = {Ranjeet Kumar Rout and SK. Sarif Hassan and Sanchit
                     Sindhwani and Hari Mohan Pandey and Saiyed Umer},
  title           = {Intelligent Classification and Analysis of Essential
                     Genes Using Quantitative Methods},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {38:1--38:21},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3343856},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3343856},
  abstract        = {Essential genes are considered to be the genes
                     required to sustain life of different organisms. These
                     genes encode proteins that maintain central metabolism,
                     DNA replications, translation of genes, and basic
                     cellular structure, and mediate the transport
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {38},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2020:ABM,
  author          = {Hongyi Zhang and Haoke Zhang and Sandeep Pirbhulal and
                     Wanqing Wu and Victor Hugo C. {De Albuquerque}},
  title           = {Active Balancing Mechanism for Imbalanced Medical Data
                     in Deep Learning-Based Classification Models},
  journal         = j-TOMM,
  volume          = {16},
  number          = {1s},
  pages           = {39:1--39:15},
  month           = apr,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3357253},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Apr 30 10:35:21 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3357253},
  abstract        = {Imbalanced data always has a serious impact on a
                     predictive model, and most under-sampling techniques
                     consume more time and suffer from loss of samples
                     containing critical information during imbalanced data
                     processing, especially in the biomedical \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {39},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Vellingiri:2020:SCB,
  author          = {Shanthi Vellingiri and Ryan P. McMahan and
                     Balakrishnan Prabhakaran},
  title           = {{SCeVE}: a Component-based Framework to Author Mixed
                     Reality Tours},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {40:1--40:23},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377353},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377353},
  abstract        = {Authoring a collaborative, interactive Mixed Reality
                     (MR) tour requires flexible design and development of
                     various software modules for tasks such as managing
                     geographically distributed participants, adaptable
                     travel and virtual camera techniques, data \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {40},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2020:BDC,
  author          = {Jiaying Liu and Sijie Song and Chunhui Liu and Yanghao
                     Li and Yueyu Hu},
  title           = {A Benchmark Dataset and Comparison Study for
                     Multi-modal Human Action Analytics},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {41:1--41:24},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3365212},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3365212},
  abstract        = {Large-scale benchmarks provide a solid foundation for
                     the development of action analytics. Most of the
                     previous activity benchmarks focus on analyzing actions
                     in RGB videos. There is a lack of large-scale and
                     high-quality benchmarks for multi-modal \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {41},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Duan:2020:EFE,
  author          = {Mingxing Duan and Kenli Li and Aijia Ouyang and Khin
                     Nandar Win and Keqin Li and Qi Tian},
  title           = {{EGroupNet}: a Feature-enhanced Network for Age
                     Estimation with Novel Age Group Schemes},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {42:1--42:23},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3379449},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3379449},
  abstract        = {Although age estimation is easily affected by smiling,
                     race, gender, and other age-related attributes, most of
                     the researchers did not pay attention to the
                     correlations among these attributes. Moreover, many
                     researchers perform age estimation from a \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {42},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Baez-Suarez:2020:SSS,
  author          = {Abraham B{\'a}ez-Su{\'a}rez and Nolan Shah and Juan
                     Arturo Nolazco-Flores and Shou-Hsuan S. Huang and
                     Omprakash Gnawali and Weidong Shi},
  title           = {{SAMAF}: Sequence-to-sequence Autoencoder Model for
                     Audio Fingerprinting},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {43:1--43:23},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3380828},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3380828},
  abstract        = {Audio fingerprinting techniques were developed to
                     index and retrieve audio samples by comparing a
                     content-based compact signature of the audio instead of
                     the entire audio sample, thereby reducing memory and
                     computational expense. Different techniques \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {43},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Mettes:2020:SIB,
  author          = {Pascal Mettes and Dennis C. Koelma and Cees G. M.
                     Snoek},
  title           = {Shuffled {ImageNet} Banks for Video Event Detection
                     and Search},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {44:1--44:21},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377875},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377875},
  abstract        = {This article aims for the detection and search of
                     events in videos, where video examples are either
                     scarce or even absent during training. To enable such
                     event detection and search, ImageNet concept banks have
                     shown to be effective. Rather than \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {44},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Noori:2020:HAR,
  author          = {Farzan Majeed Noori and Michael Riegler and Md Zia
                     Uddin and Jim Torresen},
  title           = {Human Activity Recognition from Multiple Sensors Data
                     Using Multi-fusion Representations and {CNNs}},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {45:1--45:19},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3377882},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3377882},
  abstract        = {With the emerging interest in the ubiquitous sensing
                     field, it has become possible to build assistive
                     technologies for persons during their daily life
                     activities to provide personalized feedback and
                     services. For instance, it is possible to detect an
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {45},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Rossi:2020:DUB,
  author          = {Silvia Rossi and Cagri Ozcinar and Aljosa Smolic and
                     Laura Toni},
  title           = {Do Users Behave Similarly in {VR}? {Investigation} of
                     the User Influence on the System Design},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {46:1--46:26},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3381846},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3381846},
  abstract        = {With the overarching goal of developing user-centric
                     Virtual Reality (VR) systems, a new wave of studies
                     focused on understanding how users interact in VR
                     environments has recently emerged. Despite the intense
                     efforts, however, current literature still \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {46},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Wang:2020:LLF,
  author          = {Xiao Wang and Wu Liu and Jun Chen and Xiaobo Wang and
                     Chenggang Yan and Tao Mei},
  title           = {Listen, Look, and Find the One: Robust Person Search
                     with Multimodality Index},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {47:1--47:20},
  month           = jun,
  year            = {2020},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3380549},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/abs/10.1145/3380549},
  abstract        = {Person search with one portrait, which attempts to
                     search the targets in arbitrary scenes using one
                     portrait image at a time, is an essential yet
                     unexplored problem in the multimedia field. Existing
                     approaches, which predominantly depend on the visual
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {47},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@article{Luo:2020:FFI,
  author          = {Xiaofan Luo and Fukoeng Wong and Haifeng Hu},
  title           = {{FIN}: Feature Integrated Network for Object
                     Detection},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {48:1--48:18},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3381086},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3381086},
  abstract        = {Multi-layer detection is a widely used method in the
                     field of object detection. It extracts multiple feature
                     maps with different resolutions from the backbone
                     network to detect objects of different scales, which
                     can effectively cope with the problem of \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {48},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Akpinar:2020:PPP,
  author          = {Kutalmis Akpinar and Kien A. Hua},
  title           = {{PPNet}: Privacy Protected {CDN--ISP} Collaboration
                     for {QoS}-aware Multi-{CDN} Adaptive Video Streaming},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {49:1--49:23},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3379983},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3379983},
  abstract        = {Software-defined networking introduces opportunities
                     to optimize the Internet Service Provider's network and
                     to improve client experience for the Video-on-Demand
                     applications. Recent studies on SDN frameworks show
                     that traffic engineering methods allow \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {49},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Tanwar:2020:CPP,
  author          = {Vishesh Kumar Tanwar and Balasubramanian Raman and
                     Amitesh Singh Rajput and Rama Bhargava},
  title           = {{CryptoLesion}: a Privacy-preserving Model for Lesion
                     Segmentation Using Whale Optimization over Cloud},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {50:1--50:23},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3380743},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3380743},
  abstract        = {The low-cost, accessing flexibility, agility, and
                     mobility of cloud infrastructures have attracted
                     medical organizations to store their high-resolution
                     data in encrypted form. Besides storage, these
                     infrastructures provide various image processing
                     services for plain (non-encrypted) images. Meanwhile,
                     the privacy and security of uploaded data depend upon
                     the reliability of the service provider(s). The
                     enforcement of laws towards privacy policies in
                     health-care organizations, for not disclosing their
                     patient's sensitive and private medical information,
                     restrict them to utilize these services. To address
                     these privacy concerns for melanoma detection, we
                     propose CryptoLesion, a privacy-preserving model for
                     segmenting lesion region using whale optimization
                     algorithm (WOA) over the cloud in the encrypted domain
                     (ED). The user's image is encrypted using a permutation
                     ordered binary number system and a random stumble
                     matrix. The task of segmentation is accomplished by
                     dividing an encrypted image into a pre-defined number
                     of clusters whose optimal centroids are obtained by WOA
                     in ED, followed by the assignment of each pixel of an
                     encrypted image to the unique centroid. The qualitative
                     and quantitative analysis of CryptoLesion is evaluated
                     over publicly available datasets provided in The
                     International Skin Imaging Collaboration Challenges in
                     2016, 2017, 2018, and PH2 dataset. The segmented
                     results obtained by CryptoLesion are found to be
                     comparable with the winners of respective challenges.
                     CryptoLesion is proved to be secure from a
                     probabilistic viewpoint and various cryptographic
                     attacks. To the best of our knowledge, CryptoLesion is
                     first moving towards the direction of lesion
                     segmentation in ED.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {50},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zheng:2020:DPC,
  author          = {Zhedong Zheng and Liang Zheng and Michael Garrett and
                     Yi Yang and Mingliang Xu and Yi-Dong Shen},
  title           = {Dual-path Convolutional Image-Text Embeddings with
                     Instance Loss},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {51:1--51:23},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3383184},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3383184},
  abstract        = {Matching images and sentences demands a fine
                     understanding of both modalities. In this article, we
                     propose a new system to discriminatively embed the
                     image and text to a shared visual-textual space. In
                     this field, most existing works apply the ranking
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {51},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Huang:2020:MPA,
  author          = {Xiaowen Huang and Shengsheng Qian and Quan Fang and
                     Jitao Sang and Changsheng Xu},
  title           = {Meta-path Augmented Sequential Recommendation with
                     Contextual Co-attention Network},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {52:1--52:24},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3382180},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3382180},
  abstract        = {It is critical to comprehensively and efficiently
                     learn user preferences for an effective sequential
                     recommender system. Existing sequential recommendation
                     methods mainly focus on modeling local preference from
                     users' historical behaviors, which largely \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {52},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wu:2020:IMC,
  author          = {Lingxiang Wu and Min Xu and Shengsheng Qian and
                     Jianwei Cui},
  title           = {Image to Modern {Chinese} Poetry Creation via a
                     Constrained Topic-aware Model},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {53:1--53:21},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3381858},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3381858},
  abstract        = {Artificial creativity has attracted increasing
                     research attention in the field of multimedia and
                     artificial intelligence. Despite the promising work on
                     poetry/painting/music generation, creating modern
                     Chinese poetry from images, which can significantly
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {53},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhou:2020:RLV,
  author          = {Zhili Zhou and Q. M. Jonathan Wu and Yimin Yang and
                     Xingming Sun},
  title           = {Region-Level Visual Consistency Verification for
                     Large-Scale Partial-Duplicate Image Search},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {54:1--54:25},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3383582},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3383582},
  abstract        = {Most recent large-scale image search approaches build
                     on a bag-of-visual-words model, in which local features
                     are quantized and then efficiently matched between
                     images. However, the limited discriminability of local
                     features and the BOW quantization \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {54},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{He:2020:STS,
  author          = {Jiale He and Gaobo Yang and Xin Liu and Xiangling
                     Ding},
  title           = {Spatio-temporal Saliency-based Motion Vector
                     Refinement for Frame Rate Up-conversion},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {55:1--55:18},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3382506},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3382506},
  abstract        = {A spatio-temporal saliency-based frame rate
                     up-conversion (FRUC) approach is proposed, which
                     achieves better quality of interpolated frames and
                     invalidates existing texture variation-based FRUC
                     detectors. A spatio-temporal saliency model is designed
                     to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {55},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Gelli:2020:LVE,
  author          = {Francesco Gelli and Tiberio Uricchio and Xiangnan He
                     and Alberto {Del Bimbo} and Tat-Seng Chua},
  title           = {Learning Visual Elements of Images for Discovery of
                     Brand Posts},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {56:1--56:21},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3385413},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3385413},
  abstract        = {Online Social Network Sites have become a primary
                     platform for brands and organizations to engage their
                     audience by sharing image and video posts on their
                     timelines. Different from traditional advertising,
                     these posts are not restricted to the products
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {56},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Han:2020:HRR,
  author          = {Xian-Hua Han and Yinqiang Zheng and Jiande Sun and
                     Yen-Wei Chen},
  title           = {Hyperspectral Reconstruction with Redundant Camera
                     Spectral Sensitivity Functions},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2},
  pages           = {57:1--57:15},
  month           = jun,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3386313},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Tue Jun 16 10:45:32 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3386313},
  abstract        = {High-resolution hyperspectral (HS) reconstruction has
                     recently achieved significantly progress, among which
                     the method based on the fusion of the RGB and HS images
                     of the same scene can greatly improve the
                     reconstruction performance compared with those
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {57},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Gao:2020:ISI,
  author          = {Honghao Gao and Yudong Zhang},
  title           = {Introduction to the Special Issue on Smart
                     Communications and Networking for Future Video
                     Surveillance},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {58:1--58:2},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3398382},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3398382},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {58},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Jiang:2020:SDM,
  author          = {Yizhang Jiang and Xiaoqing Gu and Dingcheng Ji and
                     Pengjiang Qian and Jing Xue and Yuanpeng Zhang and
                     Jiaqi Zhu and Kaijian Xia and Shitong Wang},
  title           = {Smart Diagnosis: a Multiple-Source Transfer {TSK}
                     Fuzzy System for {EEG} Seizure Identification},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {59:1--59:21},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3340240},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3340240},
  abstract        = {To effectively identify electroencephalogram (EEG)
                     signals in multiple-source domains, a multiple-source
                     transfer learning-based Takagi-Sugeno-Kang (TSK) fuzzy
                     system (FS), called MST-TSK, is proposed, which
                     combines multiple-source transfer learning \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {59},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wang:2020:DBD,
  author          = {Shui-Hua Wang and Yu-Dong Zhang},
  title           = {{DenseNet-201}-Based Deep Neural Network with
                     Composite Learning Factor and Precomputation for
                     Multiple Sclerosis Classification},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {60:1--60:19},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3341095},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3341095},
  abstract        = {(Aim) Multiple sclerosis is a neurological condition
                     that may cause neurologic disability. Convolutional
                     neural network can achieve good results, but tuning
                     hyperparameters of CNN needs expert knowledge and are
                     difficult and time-consuming. To identify \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {60},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Xia:2020:CDB,
  author          = {Kaijian Xia and Hongsheng Yin and Yong Jin and Shi Qiu
                     and Hongru Zhao},
  title           = {Cross-Domain Brain {CT} Image Smart Segmentation via
                     Shared Hidden Space Transfer {FCM} Clustering},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {61:1--61:21},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3357233},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3357233},
  abstract        = {Clustering is an important issue in brain medical
                     image segmentation. Original medical images used for
                     clinical diagnosis are often insufficient for
                     clustering in the current domain. As there are
                     sufficient medical images in the related domains,
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {61},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Li:2020:STD,
  author          = {Yonggang Li and Chunping Liu and Yi Ji and Shengrong
                     Gong and Haibao Xu},
  title           = {Spatio-Temporal Deep Residual Network with
                     Hierarchical Attentions for Video Event Recognition},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {62:1--62:21},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3378026},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3378026},
  abstract        = {Event recognition in surveillance video has gained
                     extensive attention from the computer vision community.
                     This process still faces enormous challenges due to the
                     tiny inter-class variations that are caused by various
                     facets, such as severe occlusion, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {62},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Si:2020:MLT,
  author          = {Wen Si and Cong Liu and Zhongqin Bi and Meijing Shan},
  title           = {Modeling Long-Term Dependencies from Videos Using Deep
                     Multiplicative Neural Networks},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {63:1--63:19},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3357797},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3357797},
  abstract        = {Understanding temporal dependencies of videos is
                     fundamental for vision problems, but deep
                     learning-based models are still insufficient in this
                     field. In this article, we propose a novel deep
                     multiplicative neural network (DMNN) for learning
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {63},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhu:2020:PCA,
  author          = {Suguo Zhu and Xiaoxian Yang and Jun Yu and Zhenying
                     Fang and Meng Wang and Qingming Huang},
  title           = {Proposal Complementary Action Detection},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {64:1--64:12},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3361845},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3361845},
  abstract        = {Temporal action detection not only requires correct
                     classification but also needs to detect the start and
                     end times of each action accurately. However,
                     traditional approaches always employ sliding windows or
                     actionness to predict the actions, and it is \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {64},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Huang:2020:NTF,
  author          = {Chenxi Huang and Yisha Lan and Guokai Zhang and Gaowei
                     Xu and Landu Jiang and Nianyin Zeng and Jenhong Tan and
                     E. Y. K. Ng and Yongqiang Cheng and Ningzhi Han and
                     Rongrong Ji and Yonghong Peng},
  title           = {A New Transfer Function for Volume Visualization of
                     Aortic Stent and Its Application to Virtual Endoscopy},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {65:1--65:14},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3373358},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3373358},
  abstract        = {Aortic stent has been widely used in restoring
                     vascular stenosis and assisting patients with
                     cardiovascular disease. The effective visualization of
                     aortic stent is considered to be critical to ensure the
                     effectiveness and functions of the aortic stent
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {65},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zink:2020:IBP,
  author          = {Michael Zink and Laura Toni and Ali C. Begen},
  title           = {Introduction to the Best Papers from the {ACM
                     Multimedia Systems (MMSys) 2019 and Co-Located
                     Workshops}},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {66:1--66:2},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3398384},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3398384},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {66},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2020:PLB,
  author          = {Rui-Xiao Zhang and Ming Ma and Tianchi Huang and
                     Haitian Pang and Xin Yao and Chenglei Wu and Lifeng
                     Sun},
  title           = {A Practical Learning-based Approach for Viewer
                     Scheduling in the Crowdsourced Live Streaming},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {67:1--67:22},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3397226},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3397226},
  abstract        = {Scheduling viewers effectively among different Content
                     Delivery Network (CDN) providers is challenging owing
                     to the extreme diversity in the crowdsourced live
                     streaming (CLS) scenarios. Abundant algorithms have
                     been proposed in recent years, which, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {67},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Altamimi:2020:QFD,
  author          = {Sa'di Altamimi and Shervin Shirmohammadi},
  title           = {{QoE}-Fair {DASH} Video Streaming Using Server-side
                     Reinforcement Learning},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {68:1--68:21},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3397227},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3397227},
  abstract        = {To design an optimal adaptive video streaming method,
                     video service providers need to consider both the
                     efficiency and the fairness of the Quality of
                     Experience (QoE) of their users. In Reference [8], we
                     proposed a server-side QoE-fair rate adaptation
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {68},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Bentaleb:2020:PAA,
  author          = {Abdelhak Bentaleb and Christian Timmerer and Ali C.
                     Begen and Roger Zimmermann},
  title           = {Performance Analysis of {ACTE}: a Bandwidth Prediction
                     Method for Low-latency Chunked Streaming},
  journal         = j-TOMM,
  volume          = {16},
  number          = {2s},
  pages           = {69:1--69:24},
  month           = jul,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3387921},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Sun Jul 19 08:56:56 MDT 2020},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3387921},
  abstract        = {HTTP adaptive streaming with chunked transfer encoding
                     can offer low-latency streaming without sacrificing the
                     coding efficiency. This allows media segments to be
                     delivered while still being packaged. However,
                     conventional schemes often make widely \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {69},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

%%% TOMM 16(2s), July 2020, article 70: shared resource allocation with
%%% SAND for ABR streaming.  The abstract field is abridged: its text
%%% stops mid-sentence with an ellipsis.
@Article{Pham:2020:ESR,
  author =       "Stefan Pham and Patrick Heeren and Calvin Schmidt and
                 Daniel Silhavy and Stefan Arbanowski",
  title =        "Evaluation of Shared Resource Allocation Using {SAND}
                 for {ABR} Streaming",
  journal =      j-TOMM,
  volume =       "16",
  number =       "2s",
  pages =        "70:1--70:18",
  month =        jul,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3388926",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun Jul 19 08:56:56 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3388926",
  abstract =     "Adaptive bitrate media streaming clients adjust the
                 quality of media content depending on the current
                 network conditions. The shared resource allocation
                 (SRA) feature defined in MPEG-SAND (server and network
                 assisted DASH) allows servers to allocate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "70",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(2s), July 2020, article 71: the Requet system for QoE metric
%%% detection on encrypted traffic.  The bibsource field lists two
%%% bibliographies (cryptography2020.bib and tomccap.bib), and the
%%% abstract is carried in full rather than abridged.
@Article{Gutterman:2020:RRT,
  author =       "Craig Gutterman and Katherine Guo and Sarthak Arora
                 and Trey Gilliland and Xiaoyang Wang and Les Wu and
                 Ethan Katz-Bassett and Gil Zussman",
  title =        "{Requet}: Real-Time {QoE} Metric Detection for
                 Encrypted {YouTube} Traffic",
  journal =      j-TOMM,
  volume =       "16",
  number =       "2s",
  pages =        "71:1--71:28",
  month =        jul,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394498",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun Jul 19 08:56:56 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3394498",
  abstract =     "As video traffic dominates the Internet, it is
                 important for operators to detect video quality of
                 experience (QoE) to ensure adequate support for video
                 traffic. With wide deployment of end-to-end encryption,
                 traditional deep packet inspection-based traffic
                 monitoring approaches are becoming ineffective. This
                 poses a challenge for network operators to monitor user
                 QoE and improve upon their experience. To resolve this
                 issue, we develop and present a system for REal-time
                 QUality of experience metric detection for Encrypted
                 Traffic --- Requet --- which is suitable for network
                 middlebox deployment. Requet uses a detection algorithm
                 that we develop to identify video and audio chunks from
                 the IP headers of encrypted traffic. Features extracted
                 from the chunk statistics are used as input to a
                 machine learning algorithm to predict QoE metrics,
                 specifically buffer warning (low buffer, high buffer),
                 video state (buffer increase, buffer decay, steady,
                 stall), and video resolution. We collect a large
                 YouTube dataset consisting of diverse video assets
                 delivered over various WiFi and LTE network conditions
                 to evaluate the performance. We compare Requet with a
                 baseline system based on previous work and show that
                 Requet outperforms the baseline system in accuracy of
                 predicting buffer low warning, video state, and video
                 resolution by $ 1.12 \times $, $ 1.53 \times $, and $
                 3.14 \times $, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "71",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(2s), July 2020, article 72: two-layer light field
%%% compression with GNN-based reconstruction.  The abstract is carried
%%% in full rather than abridged.
@Article{Hu:2020:ATL,
  author =       "Xinjue Hu and Jingming Shan and Yu Liu and Lin Zhang
                 and Shervin Shirmohammadi",
  title =        "An Adaptive Two-Layer Light Field Compression Scheme
                 Using {GNN}-Based Reconstruction",
  journal =      j-TOMM,
  volume =       "16",
  number =       "2s",
  pages =        "72:1--72:23",
  month =        jul,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3395620",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun Jul 19 08:56:56 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3395620",
  abstract =     "As a new form of volumetric media, Light Field (LF)
                 can provide users with a true six degrees of freedom
                 immersive experience because LF captures the scene with
                 photo-realism, including aperture-limited changes in
                 viewpoint. But uncompressed LF data is too large for
                 network transmission, which is the reason why LF
                 compression has become an important research topic. One
                 of the more recent approaches for LF compression is to
                 reduce the angular resolution of the input LF during
                 compression and to use LF reconstruction to recover the
                 discarded viewpoints during decompression. Following
                 this approach, we propose a new LF reconstruction
                 algorithm based on Graph Neural Networks; we show that
                 it can achieve higher compression and better quality
                 compared to existing reconstruction methods, although
                 suffering from the same problem as those methods ---
                 the inability to deal effectively with high-frequency
                 image components. To solve this problem, we propose an
                 adaptive two-layer compression architecture that
                 separates high-frequency and low-frequency components
                 and compresses each with a different strategy so that
                 the performance can become robust and controllable.
                 Experiments with multiple datasets show that our
                 proposed scheme is capable of providing a decompression
                 quality of above 40 dB, and can significantly improve
                 compression efficiency compared with similar LF
                 reconstruction schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "72",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(2s), July 2020, article 73: effect of target motion and
%%% delay on mouse-based target selection in games.  The abstract field
%%% is abridged: its text stops mid-sentence with an ellipsis.
@Article{Claypool:2020:IMD,
  author =       "Mark Claypool and Andy Cockburn and Carl Gutwin",
  title =        "The Impact of Motion and Delay on Selecting Game
                 Targets with a Mouse",
  journal =      j-TOMM,
  volume =       "16",
  number =       "2s",
  pages =        "73:1--73:24",
  month =        jul,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3390464",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sun Jul 19 08:56:56 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3390464",
  abstract =     "All real-time computer games, particularly networked
                 computer games, have a delay from when a player starts
                 an action (e.g., clicking the mouse) until the game
                 renders the result (e.g., firing a projectile). This
                 delay can degrade both player \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "73",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 74: front-matter entry for the
%%% table of contents of an online supplement.  It has no abstract, and
%%% the author is recorded as Anonymous.
@Article{Anonymous:2020:TCO,
  author =       "Anonymous",
  title =        "Table of Contents: Online Supplement Volume 16, Number
                 1s",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "74:1--74:5",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3409367",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:45:43 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3409367",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "74",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 75: image captioning with a
%%% constrained LSTM and residual attention.  The abstract field is
%%% abridged: its text stops mid-sentence with an ellipsis.
@Article{Yang:2020:CLR,
  author =       "Liang Yang and Haifeng Hu and Songlong Xing and
                 Xinlong Lu",
  title =        "Constrained {LSTM} and Residual Attention for Image
                 Captioning",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "75:1--75:18",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3386725",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3386725",
  abstract =     "Visual structure and syntactic structure are essential
                 in images and texts, respectively. Visual structure
                 depicts both entities in an image and their
                 interactions, whereas syntactic structure in texts can
                 reflect the part-of-speech constraints between
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "75",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 76: audio-visual cross-modal
%%% retrieval with deep triplet networks and cluster-CCA.  The abstract
%%% field is abridged: its text stops mid-sentence with an ellipsis.
@Article{Zeng:2020:DTN,
  author =       "Donghuo Zeng and Yi Yu and Keizo Oyama",
  title =        "Deep Triplet Neural Networks with Cluster-{CCA} for
                 Audio-Visual Cross-Modal Retrieval",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "76:1--76:23",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3387164",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3387164",
  abstract =     "Cross-modal retrieval aims to retrieve data in one
                 modality by a query in another modality, which has been
                 a very interesting research issue in the field of
                 multimedia, information retrieval, and computer vision,
                 and database. Most existing works focus \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "76",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 77: multi-view graph matching
%%% for 3D model retrieval.  The title braces the math-mode 3D token so
%%% styles cannot recase it.  The abstract field is abridged: its text
%%% stops mid-sentence with an ellipsis.
@Article{Su:2020:MVG,
  author =       "Yu-Ting Su and Wen-Hui Li and Wei-Zhi Nie and An-An
                 Liu",
  title =        "Multi-View Graph Matching for {$3$D} Model Retrieval",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "77:1--77:20",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3387920",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3387920",
  abstract =     "3D model retrieval has been widely utilized in
                 numerous domains, such as computer-aided design,
                 digital entertainment, and virtual reality. Recently,
                 many graph-based methods have been proposed to address
                 this task by using multi-view information of 3D
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "77",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 78: recurrent attention network
%%% with a reinforced generator for visual dialog.  The abstract field
%%% is abridged: its text stops mid-sentence with an ellipsis.
@Article{Fan:2020:RAN,
  author =       "Hehe Fan and Linchao Zhu and Yi Yang and Fei Wu",
  title =        "Recurrent Attention Network with Reinforced Generator
                 for Visual Dialog",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "78:1--78:16",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3390891",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3390891",
  abstract =     "In Visual Dialog, an agent has to parse temporal
                 context in the dialog history and spatial context in
                 the image to hold a meaningful dialog with humans. For
                 example, to answer ``what is the man on her left
                 wearing?'' the agent needs to (1) analyze the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "78",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 79: attention-based
%%% modality-gated networks for image-text sentiment analysis.  The
%%% abstract field is abridged: its text stops mid-sentence with an
%%% ellipsis.
@Article{Huang:2020:ABM,
  author =       "Feiran Huang and Kaimin Wei and Jian Weng and Zhoujun
                 Li",
  title =        "Attention-Based Modality-Gated Networks for Image-Text
                 Sentiment Analysis",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "79:1--79:19",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3388861",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3388861",
  abstract =     "Sentiment analysis of social multimedia data has
                 attracted extensive research interest and has been
                 applied to many tasks, such as election prediction and
                 products evaluation. Sentiment analysis of one modality
                 (e.g., text or image) has been broadly \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "79",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 80: distinguishing posed from
%%% spontaneous expressions with latent regression Bayesian networks.
%%% The abstract field is abridged: its text stops mid-sentence with an
%%% ellipsis.
@Article{Wang:2020:PSE,
  author =       "Shangfei Wang and Longfei Hao and Qiang Ji",
  title =        "Posed and Spontaneous Expression Distinction Using
                 Latent Regression {Bayesian} Networks",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "80:1--80:18",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391290",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3391290",
  abstract =     "Facial spatial patterns can help distinguish between
                 posed and spontaneous expressions, but this information
                 has not been thoroughly leveraged by current studies.
                 We present several latent regression Bayesian networks
                 (LRBNs) to capture the patterns \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "80",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 81: automated image selection
%%% for news articles.  The accented author name uses a braced LaTeX
%%% accent rather than a raw non-ASCII character.  The abstract field is
%%% abridged: its text stops mid-sentence with an ellipsis.
@Article{Liu:2020:UNA,
  author =       "Fangyu Liu and R{\'e}mi Lebret and Didier Orel and
                 Philippe Sordet and Karl Aberer",
  title =        "Upgrading the Newsroom: an Automated Image Selection
                 System for News Articles",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "81:1--81:28",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3396520",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3396520",
  abstract =     "We propose an automated image selection system to
                 assist photo editors in selecting suitable images for
                 news articles. The system fuses multiple textual
                 sources extracted from news articles and accepts
                 multilingual inputs. It is equipped with char-level
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "81",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 82: 3D facial similarity
%%% measurement and facial organization.  The title braces the math-mode
%%% 3D token so styles cannot recase it.  The abstract field is
%%% abridged: its text stops mid-sentence with an ellipsis.
@Article{Lv:2020:FSM,
  author =       "Chenlei Lv and Zhongke Wu and Xingce Wang and Mingquan
                 Zhou",
  title =        "{$3$D} Facial Similarity Measurement and Its
                 Application in Facial Organization",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "82:1--82:20",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3397765",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3397765",
  abstract =     "We propose a novel framework for 3D facial similarity
                 measurement and its application in facial organization.
                 The construction of the framework is based on Kendall
                 shape space theory. Kendall shape space is a quotient
                 space that is constructed by shape \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "82",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 83: image captioning with a
%%% joint attention mechanism over visual concept samples.  The abstract
%%% field is abridged: its text stops mid-sentence with an ellipsis.
@Article{Yuan:2020:ICJ,
  author =       "Jin Yuan and Lei Zhang and Songrui Guo and Yi Xiao and
                 Zhiyong Li",
  title =        "Image Captioning with a Joint Attention Mechanism by
                 Visual Concept Samples",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "83:1--83:22",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394955",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3394955",
  abstract =     "The attention mechanism has been established as an
                 effective method for generating caption words in image
                 captioning; it explores one noticed subregion in an
                 image to predict a related caption word. However, even
                 though the attention mechanism could \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "83",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 84: multiperson pose estimation
%%% with mask-aware deep reinforcement learning.  The abstract field is
%%% abridged: its text stops mid-sentence with an ellipsis.
@Article{Wang:2020:IMP,
  author =       "Xun Wang and Yan Tian and Xuran Zhao and Tao Yang and
                 Judith Gelernter and Jialei Wang and Guohua Cheng and
                 Wei Hu",
  title =        "Improving Multiperson Pose Estimation by Mask-aware
                 Deep Reinforcement Learning",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "84:1--84:18",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3397340",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3397340",
  abstract =     "Research on single-person pose estimation based on
                 deep neural networks has recently witnessed progress in
                 both accuracy and execution efficiency. However,
                 multiperson pose estimation is still a challenging
                 topic, partially because the object regions \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "84",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 85: learning joint structure for
%%% human pose estimation.  The abstract field is abridged: its text
%%% stops mid-sentence with an ellipsis.
@Article{Feng:2020:LJS,
  author =       "Shenming Feng and Haifeng Hu",
  title =        "Learning Joint Structure for Human Pose Estimation",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "85:1--85:17",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3392302",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3392302",
  abstract =     "Recently, tremendous progress has been achieved on
                 human pose estimation with the development of
                 convolutional neural networks (CNNs). However, current
                 methods still suffer from severe occlusion, back view,
                 and large pose variation due to the lack of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "85",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 86: single-stage instance
%%% segmentation.  The abstract field is abridged: it ends with an
%%% ellipsis after its last complete sentence.
@Article{Lin:2020:SSI,
  author =       "Feng Lin and Bin Li and Wengang Zhou and Houqiang Li
                 and Yan Lu",
  title =        "Single-stage Instance Segmentation",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "86:1--86:19",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3387926",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3387926",
  abstract =     "Albeit the highest accuracy of object detection is
                 generally acquired by multi-stage detectors, like R-CNN
                 and its extension approaches, the single-stage object
                 detectors also achieve remarkable performance with
                 faster execution and higher scalability. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "86",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 87: few-shot food recognition
%%% via multi-view representation learning.  The abstract field is
%%% abridged: its text stops mid-sentence with an ellipsis.
@Article{Jiang:2020:FSF,
  author =       "Shuqiang Jiang and Weiqing Min and Yongqiang Lyu and
                 Linhu Liu",
  title =        "Few-shot Food Recognition via Multi-view
                 Representation Learning",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "87:1--87:20",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391624",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3391624",
  abstract =     "This article considers the problem of few-shot
                 learning for food recognition. Automatic food
                 recognition can support various applications, e.g.,
                 dietary assessment and food journaling. Most existing
                 works focus on food recognition with large numbers of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "87",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 88: sketch-guided deep portrait
%%% generation.  The abstract field is abridged: its text stops
%%% mid-sentence with an ellipsis.
@Article{Ho:2020:SGD,
  author =       "Trang-Thi Ho and John Jethro Virtusio and Yung-Yao
                 Chen and Chih-Ming Hsu and Kai-Lung Hua",
  title =        "Sketch-guided Deep Portrait Generation",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "88:1--88:18",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3396237",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3396237",
  abstract =     "Generating a realistic human class image from a sketch
                 is a unique and challenging problem considering that
                 the human body has a complex structure that must be
                 preserved. Additionally, input sketches often lack
                 important details that are crucial in the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "88",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 89: an image annotation
%%% framework.  The abstract field is abridged: its text stops
%%% mid-sentence with an ellipsis.
@Article{Srivastava:2020:DAI,
  author =       "Gargi Srivastava and Rajeev Srivastava",
  title =        "Design, Analysis, and Implementation of Efficient
                 Framework for Image Annotation",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "89:1--89:24",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3386249",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3386249",
  abstract =     "In this article, a general framework of image
                 annotation is proposed by involving salient object
                 detection (SOD), feature extraction, feature selection,
                 and multi-label classification. For SOD,
                 Augmented-Gradient Vector Flow (A-GVF) is proposed,
                 which \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "89",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 90: kernel attention network for
%%% single image super-resolution.  The abstract field is abridged: its
%%% text stops mid-sentence with an ellipsis.
@Article{Zhang:2020:KAN,
  author =       "Dongyang Zhang and Jie Shao and Heng Tao Shen",
  title =        "Kernel Attention Network for Single Image
                 Super-Resolution",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "90:1--90:15",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3398685",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3398685",
  abstract =     "Recently, attention mechanisms have shown a developing
                 tendency toward convolutional neural network (CNN), and
                 some representative attention mechanisms, i.e., channel
                 attention (CA) and spatial attention (SA) have been
                 fully applied to single image \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "90",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 16(3), September 2020, article 91: blind image quality
%%% assessment from natural scene statistics and perceptual
%%% characteristics.  The abstract field is abridged: its text stops
%%% mid-sentence with an ellipsis.
%%% NOTE(review): pages is recorded as 91:1--91:91, i.e. a 91-page
%%% article whose final page number equals the article number, while the
%%% neighbouring research articles of this issue run 15 to 28 pages.
%%% This may be a publisher metadata artifact; verify the page count
%%% against the publisher record before relying on it.
@Article{Liu:2020:BIQ,
  author =       "Yutao Liu and Ke Gu and Xiu Li and Yongbing Zhang",
  title =        "Blind Image Quality Assessment by Natural Scene
                 Statistics and Perceptual Characteristics",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "91:1--91:91",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3414837",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3414837",
  abstract =     "Opinion-unaware blind image quality assessment (OU
                 BIQA) refers to establishing a blind quality prediction
                 model without using the expensive subjective quality
                 scores, which is a highly promising direction in the
                 BIQA research. In this article, we focus \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "91",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Francis:2020:UTF,
  author =       "Jobin Francis and Baburaj M. and Sudhish N. George",
  title =        "A Unified Tensor Framework for Clustering and
                 Simultaneous Reconstruction of Incomplete Imaging
                 Data",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3",
  pages =        "92:1--92:24",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399806",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Sep 5 18:46:01 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3399806",
  abstract =     "Incomplete observations in the data are always
                 troublesome to data clustering algorithms. In fact,
                 most of the well-received techniques are not designed
                 to encounter such imperative scenarios. Hence,
                 clustering of images under incomplete samples is an
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "92",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sharma:2021:ISI,
  author =       "Suraj Sharma and Xuyun Zhang and Hesham El-Sayed and
                 Zhiyuan Tan",
  title =        "Introduction to the Special Issue on Privacy and
                 Security in Evolving {Internet of Multimedia Things}",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "93:1--93:3",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3423955",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3423955",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "93",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:LBO,
  author =       "Xiaolong Xu and Qihe Huang and Yiwen Zhang and
                 Shancang Li and Lianyong Qi and Wanchun Dou",
  title =        "An {LSH}-based Offloading Method for {IoMT} Services
                 in Integrated Cloud-Edge Environment",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "94:1--94:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408319",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408319",
  abstract =     "Benefiting from the massive available data provided by
                 Internet of multimedia things (IoMT), enormous
                 intelligent services requiring information of various
                 types to make decisions are emerging. Generally, the
                 IoMT devices are equipped with limited \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "94",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gati:2021:DPT,
  author =       "Nicholaus J. Gati and Laurence T. Yang and Jun Feng
                 and Yijun Mo and Mamoun Alazab",
  title =        "Differentially Private Tensor Train Deep Computation
                 for {Internet of Multimedia Things}",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "95:1--95:20",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3421276",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3421276",
  abstract =     "The significant growth of the Internet of Things (IoT)
                 takes a key and active role in healthcare, smart homes,
                 smart manufacturing, and wearable gadgets. Due to
                 complexness and difficulty in processing multimedia
                 data, the IoT based scheme, namely \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "95",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liang:2021:FBS,
  author =       "Haoran Liang and Jun Wu and Xi Zheng and Mengshi Zhang
                 and Jianhua Li and Alireza Jolfaei",
  title =        "Fog-based Secure Service Discovery for {Internet of
                 Multimedia Things}: a Cross-blockchain Approach",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "96:1--96:23",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3415151",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3415151",
  abstract =     "The Internet of Multimedia Things (IoMT) has become
                 the backbone of innumerable multimedia applications in
                 various fields. The wide application of IoMT not only
                 makes our life convenient but also brings challenges to
                 service discovery. Service discovery \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "96",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lv:2021:ASI,
  author =       "Zhihan Lv and Liang Qiao and Houbing Song",
  title =        "Analysis of the Security of {Internet of Multimedia
                 Things}",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "97:1--97:16",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3398201",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3398201",
  abstract =     "To study the security performance of the Internet of
                 multimedia things on the privacy protection of user
                 identity, behavior trajectory, and preference under the
                 new information technology industry wave, in this
                 study, aiming at the problems of the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "97",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sahoo:2021:SAD,
  author =       "Kshira Sagar Sahoo and Deepak Puthal",
  title =        "{SDN}-Assisted {DDoS} Defense Framework for the
                 {Internet of Multimedia Things}",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "98:1--98:18",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394956",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3394956",
  abstract =     "The Internet of Things is visualized as a fundamental
                 networking model that bridges the gap between the cyber
                 and real-world entity. Uniting the real-world object
                 with virtualization technology is opening further
                 opportunities for innovation in nearly \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "98",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Namasudra:2021:SMU,
  author =       "Suyel Namasudra and Rupak Chakraborty and Abhishek
                 Majumder and Nageswara Rao Moparthi",
  title =        "Securing Multimedia by Using {DNA}-Based Encryption in
                 the Cloud Computing Environment",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "99:1--99:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3392665",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3392665",
  abstract =     "Today, the size of a multimedia file is increasing day
                 by day from gigabytes to terabytes or even petabytes,
                 mainly because of the evolution of a large amount of
                 real-time data. As most of the multimedia files are
                 transmitted through the internet, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "99",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Fang:2021:PPM,
  author =       "Liming Fang and Changchun Yin and Juncen Zhu and
                 Chunpeng Ge and M. Tanveer and Alireza Jolfaei and
                 Zehong Cao",
  title =        "Privacy Protection for Medical Data Sharing in Smart
                 Healthcare",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "100:1--100:18",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408322",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408322",
  abstract =     "In virtue of advances in smart networks and the cloud
                 computing paradigm, smart healthcare is transforming.
                 However, there are still challenges, such as storing
                 sensitive data in untrusted and controlled
                 infrastructure and ensuring the secure \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "100",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Singh:2021:DHC,
  author =       "A. K. Singh",
  title =        "Data Hiding: Current Trends, Innovation and Potential
                 Challenges",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "101:1--101:16",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3382772",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3382772",
  abstract =     "With the widespread growth of digital information and
                 improved internet technologies, the demand for improved
                 information security techniques has significantly
                 increased due to privacy leakage, identity theft,
                 illegal copying, and data distribution. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "101",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hu:2021:MLM,
  author =       "Hezhen Hu and Wengang Zhou and Xingze Li and Ning Yan
                 and Houqiang Li",
  title =        "{MV2Flow}: Learning Motion Representation for Fast
                 Compressed Video Action Recognition",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "102:1--102:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3422360",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3422360",
  abstract =     "In video action recognition, motion is a very crucial
                 clue, which is usually represented by optical flow.
                 However, optical flow is computationally expensive to
                 obtain, which becomes the bottleneck for the efficiency
                 of traditional action recognition \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "102",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cui:2021:SSI,
  author =       "Chaoran Cui and Peiguang Lin and Xiushan Nie and Muwei
                 Jian and Yilong Yin",
  title =        "Social-sensed Image Aesthetics Assessment",
  journal =      j-TOMM,
  volume =       "16",
  number =       "3s",
  pages =        "103:1--103:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3414843",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 22 06:57:30 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3414843",
  abstract =     "Image aesthetics assessment aims to endow computers
                 with the ability to judge the aesthetic values of
                 images, and its potential has been recognized in a
                 variety of applications. Most previous studies perform
                 aesthetics assessment purely based on image \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "103",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sharma:2021:TCO,
  author =       "Suraj Sharma",
  title =        "Table of Contents: Online Supplement Volume 16, Number
                 3s",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "117e:1--117e:2",
  month =        jan,
  year =         "2021",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 10:01:20 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "117",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shao:2021:EBR,
  author =       "Huiru Shao and Jing Li and Jia Zhang and Hui Yu and
                 Jiande Sun",
  title =        "Eye-based Recognition for User Identification on
                 Mobile Devices",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "117:1--117:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399659",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3399659",
  abstract =     "User identification is becoming more and more
                 important for Apps on mobile devices. However, the
                 identity recognition based on eyes, e.g., iris
                 recognition, is rarely used on mobile devices comparing
                 with those based on face and fingerprint due to its
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "117",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2021:NKT,
  author =       "Zuquan Liu and Guopu Zhu and Yuan-Gen Wang and
                 Jianquan Yang and Sam Kwong",
  title =        "A Novel $ (t, s, k, n)$-Threshold Visual Secret
                 Sharing Scheme Based on Access Structure Partition",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "118:1--118:21",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418212",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418212",
  abstract =     "Visual secret sharing (VSS) is a new technique for
                 sharing a binary image into multiple shadows. For VSS,
                 the original image can be reconstructed from the
                 shadows in any qualified set, but cannot be
                 reconstructed from those in any forbidden set. In most
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "118",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Becattini:2021:DPA,
  author =       "Federico Becattini and Tiberio Uricchio and Lorenzo
                 Seidenari and Lamberto Ballan and Alberto {Del Bimbo}",
  title =        "Am {I} Done? {Predicting} Action Progress in Videos",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "119:1--119:24",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3402447",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3402447",
  abstract =     "In this article, we deal with the problem of
                 predicting action progress in videos. We argue that
                 this is an extremely important task, since it can be
                 valuable for a wide range of interaction applications.
                 To this end, we introduce a novel approach, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "119",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ruan:2021:CDI,
  author =       "Weijian Ruan and Chao Liang and Yi Yu and Zheng Wang
                 and Wu Liu and Jun Chen and Jiayi Ma",
  title =        "Correlation Discrepancy Insight Network for Video
                 Re-identification",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "120:1--120:21",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3402666",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3402666",
  abstract =     "Video-based person re-identification (ReID) aims at
                 re-identifying a specified person sequence from videos
                 that were captured by disjoint cameras. Most existing
                 works on this task ignore the quality discrepancy
                 across frames by using all video frames to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "120",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2021:SSI,
  author =       "Xin Yang and Yu Qiao and Shaozhe Chen and Shengfeng He
                 and Baocai Yin and Qiang Zhang and Xiaopeng Wei and
                 Rynson W. H. Lau",
  title =        "Smart Scribbles for Image Matting",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "121:1--121:21",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408323",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408323",
  abstract =     "Image matting is an ill-posed problem that usually
                 requires additional user input, such as trimaps or
                 scribbles. Drawing a fine trimap requires a large
                 amount of user effort, while using scribbles can hardly
                 obtain satisfactory alpha mattes for non-professional
                 users. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "121",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yan:2021:DID,
  author =       "Chenggang Yan and Zhisheng Li and Yongbing Zhang and
                 Yutao Liu and Xiangyang Ji and Yongdong Zhang",
  title =        "Depth Image Denoising Using Nuclear Norm and Learning
                 Graph Model",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "122:1--122:17",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3404374",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3404374",
  abstract =     "Depth image denoising is increasingly becoming the hot
                 research topic nowadays, because it reflects the
                 three-dimensional scene and can be applied in various
                 fields of computer vision. But the depth images
                 obtained from depth camera usually contain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "122",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2021:MAS,
  author =       "Lin Zhu and Xiurong Jiang and Jianing Li and Yuanhong
                 Hao and Yonghong Tian",
  title =        "Motion-Aware Structured Matrix Factorization for
                 Foreground Detection in Complex Scenes",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "123:1--123:23",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3407188",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3407188",
  abstract =     "Foreground detection is one of the key steps in
                 computer vision applications. Many foreground and
                 background models have been proposed and achieved
                 promising performance in static scenes. However, due to
                 challenges such as dynamic background, irregular
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "123",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wei:2021:CNL,
  author =       "Yang Wei and Zhuzhu Wang and Bin Xiao and Ximeng Liu
                 and Zheng Yan and Jianfeng Ma",
  title =        "Controlling Neural Learning Network with Multiple
                 Scales for Image Splicing Forgery Detection",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "124:1--124:22",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408299",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408299",
  abstract =     "The guarantee of social stability comes from many
                 aspects of life, and image information security as one
                 of them is being subjected to various malicious
                 attacks. As a means of information attack, image
                 splicing forgery refers to copying some areas of an
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "124",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zeng:2021:VRS,
  author =       "Kun Zeng and Jiangchuan Hu and Yongyi Gong and
                 Kanoksak Wattanachote and Runpeng Yu and Xiaonan Luo",
  title =        "Vertical Retargeting for Stereoscopic Images via
                 Stereo Seam Carving",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "125:1--125:22",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408295",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408295",
  abstract =     "Vertical retargeting for stereoscopic images using
                 seam manipulation-based approaches has remained an open
                 challenge over the years. Even though horizontal
                 retargeting had attracted a huge amount of interest,
                 its seam coupling strategies were not \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "125",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tian:2021:PIC,
  author =       "Tao Tian and Hanli Wang and Sam Kwong and C.-C. Jay
                 Kuo",
  title =        "Perceptual Image Compression with Block-Level Just
                 Noticeable Difference Prediction",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "126:1--126:15",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408320",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408320",
  abstract =     "A block-level perceptual image compression framework
                 is proposed in this work, including a block-level just
                 noticeable difference (JND) prediction model and a
                 preprocessing scheme. Specifically speaking,
                 block-level JND values are first deduced by \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "126",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{He:2021:MFU,
  author =       "Xin He and Qiong Liu and You Yang",
  title =        "Make Full Use of Priors: Cross-View Optimized Filter
                 for Multi-View Depth Enhancement",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "127:1--127:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408293",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408293",
  abstract =     "Multi-view video plus depth (MVD) is the promising and
                 widely adopted data representation for future 3D visual
                 applications and interactive media. However,
                 compression distortions on depth videos impede the
                 development of such applications, and filters
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "127",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2021:AAB,
  author =       "Xiaoxiao Liu and Qingyang Xu",
  title =        "Adaptive Attention-based High-level Semantic
                 Introduction for Image Caption",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "128:1--128:22",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3409388",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3409388",
  abstract =     "There have been several attempts to integrate a
                 spatial visual attention mechanism into an image
                 caption model and introduce semantic concepts as the
                 guidance of image caption generation. High-level
                 semantic information consists of the abstractedness
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "128",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{UlFazal:2021:EIC,
  author =       "Muhammad Abu {Ul Fazal} and Sam Ferguson and Andrew
                 Johnston",
  title =        "Evaluation of Information Comprehension in Concurrent
                 Speech-based Designs",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "129:1--129:19",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3409463",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3409463",
  abstract =     "In human-computer interaction, particularly in
                 multimedia delivery, information is communicated to
                 users sequentially, whereas users are capable of
                 receiving information from multiple sources
                 concurrently. This mismatch indicates that a sequential
                 mode \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "129",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2021:LDA,
  author =       "Yucheng Zhu and Guangtao Zhai and Xiongkuo Min and
                 Jiantao Zhou",
  title =        "Learning a Deep Agent to Predict Head Movement in
                 360-Degree Images",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "130:1--130:23",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3410455",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3410455",
  abstract =     "Virtual reality adequately stimulates senses to trick
                 users into accepting the virtual environment. To create
                 a sense of immersion, high-resolution images are
                 required to satisfy human visual system, and low
                 latency is essential for smooth operations, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "130",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Nie:2021:MMI,
  author =       "Weizhi Nie and Qi Liang and Yixin Wang and Xing Wei
                 and Yuting Su",
  title =        "{MMFN}: Multimodal Information Fusion Networks for
                 {$3$D} Model Classification and Retrieval",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "131:1--131:22",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3410439",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3410439",
  abstract =     "In recent years, research into 3D shape recognition in
                 the field of multimedia and computer vision has
                 attracted wide attention. With the rapid development of
                 deep learning, various deep models have achieved
                 state-of-the-art performance based on \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "131",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2021:GRC,
  author =       "Zhongying Zhao and Yonghao Yang and Chao Li and
                 Liqiang Nie",
  title =        "{GuessUNeed}: Recommending Courses via Neural
                 Attention Network and Course Prerequisite Relation
                 Embeddings",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "132:1--132:17",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3410441",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3410441",
  abstract =     "Massive Open Online Courses, offering millions of
                 high-quality courses from prestigious universities and
                 prominent experts, are picking up momentum in
                 popularity. Although users enrolling on MOOCs have free
                 access to abundant knowledge, they may easily
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "132",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2021:KDE,
  author =       "Yi Huang and Xiaoshan Yang and Junyu Gao and Jitao
                 Sang and Changsheng Xu",
  title =        "Knowledge-driven Egocentric Multimodal Activity
                 Recognition",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "133:1--133:??",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3409332",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3409332",
  abstract =     "Recognizing activities from egocentric multimodal data
                 collected by wearable cameras and sensors, is gaining
                 interest, as multimodal methods always benefit from the
                 complementarity of different modalities. However, since
                 high-dimensional videos contain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "133",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:PBS,
  author =       "Yaoyu Li and Hantao Yao and Tianzhu Zhang and
                 Changsheng Xu",
  title =        "Part-based Structured Representation Learning for
                 Person Re-identification",
  journal =      j-TOMM,
  volume =       "16",
  number =       "4",
  pages =        "134:1--134:22",
  month =        jan,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3412384",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Feb 10 10:15:11 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3412384",
  abstract =     "Person re-identification aims to match person of
                 interest under non-overlapping camera views. Therefore,
                 how to generate a robust and discriminative
                 representation is crucial for person re-identification.
                 Mining local clues from human body parts to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "134",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Jin:2021:MTL,
  author =       "Xin Jin and Jianfeng Xu and Kazuyuki Tasaka and Zhibo
                 Chen",
  title =        "Multi-task Learning-based All-in-one Collaboration
                 Framework for Degraded Image Super-resolution",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "21:1--21:21",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3417333",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3417333",
  abstract =     "In this article, we address the degraded image
                 super-resolution problem in a multi-task learning (MTL)
                 manner. To better share representations between
                 multiple tasks, we propose an all-in-one collaboration
                 framework (ACF) with a learnable ``junction'' unit
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tran:2021:CQM,
  author =       "Huyen T. T. Tran and Nam Pham Ngoc and Tobias
                 Ho{\ss}feld and Michael Seufert and Truong Cong Thang",
  title =        "Cumulative Quality Modeling for {HTTP} Adaptive
                 Streaming",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "22:1--22:24",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3423421",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3423421",
  abstract =     "HTTP Adaptive Streaming has become the de facto choice
                 for multimedia delivery. However, the quality of
                 adaptive video streaming may fluctuate strongly during
                 a session due to throughput fluctuations. So, it is
                 important to evaluate the quality of a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:SVM,
  author =       "Tong Xu and Peilun Zhou and Linkang Hu and Xiangnan He
                 and Yao Hu and Enhong Chen",
  title =        "Socializing the Videos: a Multimodal Approach for
                 Social Relation Recognition",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "23:1--23:23",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3416493",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3416493",
  abstract =     "As a crucial task for video analysis, social relation
                 recognition for characters not only provides
                 semantically rich description of video content but also
                 supports intelligent applications, e.g., video
                 retrieval and visual question answering. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yan:2021:RSI,
  author =       "Xuehu Yan and Lintao Liu and Longlong Li and Yuliang
                 Lu",
  title =        "Robust Secret Image Sharing Resistant to Noise in
                 Shares",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "24:1--24:22",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3419750",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3419750",
  abstract =     "A secret image is split into $n$ shares in the
                 generation phase of secret image sharing (SIS) for a
                 $(k, n)$ threshold. In the recovery phase, the secret
                 image is recovered when any $k$ or more shares are
                 collected, and each collected share is generally
                 assumed to be \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:ANM,
  author =       "Mingliang Xu and Qingfeng Li and Jianwei Niu and Hao
                 Su and Xiting Liu and Weiwei Xu and Pei Lv and Bing
                 Zhou and Yi Yang",
  title =        "{ART-UP}: a Novel Method for Generating
                 Scanning-Robust Aesthetic {QR} Codes",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "25:1--25:23",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418214",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418214",
  abstract =     "Quick response (QR) codes are usually scanned in
                 different environments, so they must be robust to
                 variations in illumination, scale, coverage, and camera
                 angles. Aesthetic QR codes improve the visual quality,
                 but subtle changes in their appearance may \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2021:CIR,
  author =       "Peihao Yang and Linghe Kong and Meikang Qiu and Xue
                 Liu and Guihai Chen",
  title =        "Compressed Imaging Reconstruction with Sparse Random
                 Projection",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "26:1--26:25",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447431",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447431",
  abstract =     "As the Internet of Things thrives, monitors and
                 cameras produce tons of image data every day. To
                 efficiently process these images, many compressed
                 imaging frameworks are proposed. A compressed imaging
                 framework comprises two parts, image signal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qi:2021:GNT,
  author =       "Lei Qi and Lei Wang and Jing Huo and Yinghuan Shi and
                 Yang Gao",
  title =        "{GreyReID}: a Novel Two-stream Deep Framework with
                 {RGB}-grey Information for Person Re-identification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "27:1--27:22",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3419439",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3419439",
  abstract =     "In this article, we observe that most false positive
                 images (i.e., different identities with query images)
                 in the top ranking list usually have the similar color
                 information with the query image in person
                 re-identification (Re-ID). Meanwhile, when we use
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chehabeddine:2021:BMH,
  author =       "Said Chehabeddine and Muhammad Hassan Jamil and Wanjoo
                 Park and Dianne L. Sefo and Peter M. Loomer and Mohamad
                 Eid",
  title =        "{Bi}-manual Haptic-based Periodontal Simulation with
                 Finger Support and Vibrotactile Feedback",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "28:1--28:17",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3421765",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3421765",
  abstract =     "The rise of virtual reality and haptic technologies
                 has created exciting new applications in medical
                 training and education. In a dental simulation, haptic
                 technology can create the illusion of substances
                 (teeth, gingiva, bone, etc.) by providing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:MHP,
  author =       "Jianshu Li and Jian Zhao and Congyan Lang and Yidong
                 Li and Yunchao Wei and Guodong Guo and Terence Sim and
                 Shuicheng Yan and Jiashi Feng",
  title =        "Multi-human Parsing with a Graph-based Generative
                 Adversarial Model",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "29:1--29:21",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418217",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418217",
  abstract =     "Human parsing is an important task in human-centric
                 image understanding in computer vision and multimedia
                 systems. However, most existing works on human parsing
                 mainly tackle the single-person scenario, which
                 deviates from real-world applications where \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cinar:2021:IJB,
  author =       "Yusuf Cinar and Peter Pocta and Desmond Chambers and
                 Hugh Melvin",
  title =        "Improved Jitter Buffer Management for {WebRTC}",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "30:1--30:20",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3410449",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3410449",
  abstract =     "This work studies the jitter buffer management
                 algorithm for Voice over IP in WebRTC. In particular,
                 it details the core concepts of WebRTC's jitter buffer
                 management. Furthermore, it investigates how jitter
                 buffer management algorithm behaves under \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Czekierda:2021:AOO,
  author =       "Lukasz Czekierda and Krzysztof Zieli{\'n}ski and
                 S{\l}awomir Zieli{\'n}ski",
  title =        "Automated Orchestration of Online Educational
                 Collaboration in Cloud-based Environments",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "31:1--31:26",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3412381",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3412381",
  abstract =     "Integrated collaboration environments (ICEs) are
                 widely used by corporations to increase productivity by
                 fostering groupwide and interpersonal collaboration. In
                 this article, we discuss the enhancements of such
                 environment needed to build an educational \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kieu:2021:BLD,
  author =       "My Kieu and Andrew D. Bagdanov and Marco Bertini",
  title =        "Bottom-up and Layerwise Domain Adaptation for
                 Pedestrian Detection in Thermal Images",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "32:1--32:19",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418213",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418213",
  abstract =     "Pedestrian detection is a canonical problem for safety
                 and security applications, and it remains a challenging
                 problem due to the highly variable lighting conditions
                 in which pedestrians must be detected. This article
                 investigates several domain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:MIH,
  author =       "Wenjie Wang and Ling-Yu Duan and Hao Jiang and
                 Peiguang Jing and Xuemeng Song and Liqiang Nie",
  title =        "{Market$2$Dish}: Health-aware Food Recommendation",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "33:1--33:19",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418211",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418211",
  abstract =     "With the rising incidence of some diseases, such as
                 obesity and diabetes, the healthy diet is arousing
                 increasing attention. However, most existing
                 food-related research efforts focus on recipe
                 retrieval, user-preference-based food recommendation,
                 cooking \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2021:ADA,
  author =       "Yiding Liu and Siyu Yang and Bin Li and Wengang Zhou
                 and Jizheng Xu and Houqiang Li and Yan Lu",
  title =        "Affinity Derivation for Accurate Instance
                 Segmentation",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "34:1--34:20",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3407090",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3407090",
  abstract =     "Affinity, which represents whether two pixels belong
                 to a same instance, is an equivalent representation to
                 the instance segmentation labels. Conventional works do
                 not make an explicit exploration on the affinity. In
                 this article, we present two instance \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yu:2021:CLG,
  author =       "Yi Yu and Abhishek Srivastava and Simon Canales",
  title =        "Conditional {LSTM-GAN} for Melody Generation from
                 Lyrics",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "35:1--35:20",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3424116",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3424116",
  abstract =     "Melody generation from lyrics has been a challenging
                 research issue in the field of artificial intelligence
                 and music, which enables us to learn and discover
                 latent relationships between interesting lyrics and
                 accompanying melodies. Unfortunately, the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2021:AWE,
  author =       "Xin Yang and Xuemeng Song and Fuli Feng and Haokun Wen
                 and Ling-Yu Duan and Liqiang Nie",
  title =        "Attribute-wise Explainable Fashion Compatibility
                 Modeling",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "36:1--36:21",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3425636",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3425636",
  abstract =     "With the boom of the fashion market and people's daily
                 needs for beauty, clothing matching has gained
                 increased research attention. In a sense, tackling this
                 problem lies in modeling the human notions of the
                 compatibility between fashion items, i.e., \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:SSL,
  author =       "Zhixin Li and Lan Lin and Canlong Zhang and Huifang Ma
                 and Weizhong Zhao and Zhiping Shi",
  title =        "A Semi-supervised Learning Approach Based on Adaptive
                 Weighted Fusion for Automatic Image Annotation",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "37:1--37:23",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3426974",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3426974",
  abstract =     "To learn a well-performed image annotation model, a
                 large number of labeled samples are usually required.
                 Although the unlabeled samples are readily available
                 and abundant, it is a difficult task for humans to
                 annotate large numbers of images manually. In
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2021:DVV,
  author =       "Yanwei Liu and Jinxia Liu and Antonios Argyriou and
                 Siwei Ma and Liming Wang and Zhen Xu",
  title =        "$ 360$-Degree {VR} Video Watermarking Based on
                 Spherical Wavelet Transform",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1",
  pages =        "38:1--38:23",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3425605",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:40:21 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3425605",
  abstract =     "Similar to conventional video, the increasingly
                 popular $ 360^\circ $ virtual reality (VR) video requires
                 copyright protection mechanisms. The classic approach
                 for copyright protection is the introduction of a
                 digital watermark into the video sequence. Due to the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:IBM,
  author =       "Yang Wang and Meng Fang and Joey Tianyi Zhou and
                 Tingting Mu and Dacheng Tao",
  title =        "Introduction to Big Multimodal Multimedia Data with
                 Deep Analytics",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "1:1--1:3",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447530",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447530",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:ZSC,
  author =       "Xing Xu and Jialin Tian and Kaiyi Lin and Huimin Lu
                 and Jie Shao and Heng Tao Shen",
  title =        "Zero-shot Cross-modal Retrieval by Assembling
                 {AutoEncoder} and Generative Adversarial Network",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "3:1--3:17",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3424341",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3424341",
  abstract =     "Conventional cross-modal retrieval models mainly
                 assume the same scope of the classes for both the
                 training set and the testing set. This assumption
                 limits their extensibility on zero-shot cross-modal
                 retrieval (ZS-CMR), \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Fu:2021:DGL,
  author =       "Sichao Fu and Weifeng Liu and Weili Guan and Yicong
                 Zhou and Dapeng Tao and Changsheng Xu",
  title =        "Dynamic Graph Learning Convolutional Networks for
                 Semi-supervised Classification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "4:1--4:13",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3412846",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3412846",
  abstract =     "Over the past few years, graph representation learning
                 (GRL) has received widespread attention on the feature
                 representations of the non-Euclidean data. As a typical
                 model of GRL, graph convolutional networks \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2021:DNP,
  author =       "Zhao Zhang and Jiahuan Ren and Haijun Zhang and Zheng
                 Zhang and Guangcan Liu and Shuicheng Yan",
  title =        "{DLRF-Net}: a Progressive Deep Latent Low-Rank Fusion
                 Network for Hierarchical Subspace Discovery",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "5:1--5:24",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3402030",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3402030",
  abstract =     "Low-rank coding-based representation learning is
                 powerful for discovering and recovering the subspace
                 structures in data, which has obtained an impressive
                 performance; however, it still cannot obtain deep
                 hidden \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2021:GMM,
  author =       "Yi Zhang and Miaomiao Li and Siwei Wang and Sisi Dai
                 and Lei Luo and En Zhu and Huiying Xu and Xinzhong Zhu
                 and Chaoyun Yao and Haoran Zhou",
  title =        "{Gaussian} Mixture Model Clustering with Incomplete
                 Data",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "6:1--6:14",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408318",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408318",
  abstract =     "Gaussian mixture model (GMM) clustering has been
                 extensively studied due to its effectiveness and
                 efficiency. Though demonstrating promising performance
                 in various applications, it cannot effectively address
                 the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2021:ROR,
  author =       "Jing Zhang and Jiaqi Guo and Yonggong Ren",
  title =        "Robust Ordinal Regression: User Credit Grading with
                 Triplet Loss-Based Sampling",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "7:1--7:20",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408303",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408303",
  abstract =     "With the development of social media sites, user
                 credit grading, which served as an important and
                 fashionable problem, has attracted substantial
                 attention from a slew of developers and operators of
                 mobile applications. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:EIE,
  author =       "Xin Xu and Shiqin Wang and Zheng Wang and Xiaolong
                 Zhang and Ruimin Hu",
  title =        "Exploring Image Enhancement for Salient Object
                 Detection in Low Light Images",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "8:1--8:19",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3414839",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3414839",
  abstract =     "Low light images captured in a non-uniform
                 illumination environment usually are degraded with the
                 scene depth and the corresponding environment lights.
                 This degradation results in severe object information
                 loss in the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:LSI,
  author =       "Yanchun Li and Jianglian Cao and Zhetao Li and
                 Sangyoon Oh and Nobuyoshi Komuro",
  title =        "Lightweight Single Image Super-resolution with Dense
                 Connection Distillation Network",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "9:1--9:17",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3414838",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3414838",
  abstract =     "Single image super-resolution attempts to reconstruct
                 a high-resolution (HR) image from its corresponding
                 low-resolution (LR) image, which has been a research
                 hotspot in computer vision and image processing for
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:SDM,
  author =       "Yang Wang",
  title =        "Survey on Deep Multi-modal Data Analytics:
                 Collaboration, Rivalry, and Fusion",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "10:1--10:25",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408317",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408317",
  abstract =     "With the development of web technology, multi-modal or
                 multi-view data has surged as a major stream for big
                 data, where each modal/view encodes individual property
                 of data objects. Often, different modalities are
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:ISI,
  author =       "Yang Wang and Meng Fang and Joey Tianyi Zhou and
                 Tingting Mu and Dacheng Tao",
  title =        "Introduction to the Special Issue on Fine-grained
                 Visual Computing",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "11:1--11:3",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447532",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447532",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hu:2021:AEN,
  author =       "Yutao Hu and Xuhui Liu and Baochang Zhang and Jungong
                 Han and Xianbin Cao",
  title =        "Alignment Enhancement Network for Fine-grained Visual
                 Categorization",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "12:1--12:20",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446208",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446208",
  abstract =     "Fine-grained visual categorization (FGVC) aims to
                 automatically recognize objects from different
                 sub-ordinate categories. Despite attracting
                 considerable attention from both academia and industry,
                 it remains a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Guan:2021:UPS,
  author =       "Weili Guan and Zhaozheng Chen and Fuli Feng and
                 Weifeng Liu and Liqiang Nie",
  title =        "Urban Perception: Sensing Cities via a Deep
                 Interactive Multi-task Learning Framework",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "13:1--13:20",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3424115",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3424115",
  abstract =     "Social scientists have shown evidence that visual
                 perceptions of urban attributes, such as safe, wealthy,
                 and beautiful perspectives of the given cities, are
                 highly correlated to the residents' behaviors and
                 quality \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lu:2021:CIC,
  author =       "Huimin Lu and Rui Yang and Zhenrong Deng and Yonglin
                 Zhang and Guangwei Gao and Rushi Lan",
  title =        "{Chinese} Image Captioning via Fuzzy Attention-based
                 {DenseNet-BiLSTM}",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "14:1--14:18",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3422668",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3422668",
  abstract =     "Chinese image description generation tasks usually
                 have some challenges, such as single-feature
                 extraction, lack of global information, and lack of
                 detailed description of the image content. To address
                 these limitations, we \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xiao:2021:WSS,
  author =       "Junsheng Xiao and Huahu Xu and Honghao Gao and Minjie
                 Bian and Yang Li",
  title =        "A Weakly Supervised Semantic Segmentation Network by
                 Aggregating Seed Cues: The Multi-Object Proposal
                 Generation Perspective",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "15:1--15:19",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3419842",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3419842",
  abstract =     "Weakly supervised semantic segmentation under
                 image-level annotations is effectiveness for real-world
                 applications. The small and sparse discriminative
                 regions obtained from an image classification network
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2021:RMR,
  author =       "Chao Zhang and Xiaopei Wu and Jianchao Lu and Xi Zheng
                 and Alireza Jolfaei and Quan Z. Sheng and Dongjin Yu",
  title =        "{RICA-MD}: a Refined {ICA} Algorithm for Motion
                 Detection",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "17:1--17:17",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3416492",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3416492",
  abstract =     "With the rapid development of various computing
                 technologies, the constraints of data processing
                 capabilities gradually disappeared, and more data can
                 be simultaneously processed to obtain better \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Rahman:2021:MMP,
  author =       "MD Abdur Rahman and M. Shamim Hossain and Nabil A.
                 Alrajeh and B. B. Gupta",
  title =        "A Multimodal, Multimedia Point-of-Care Deep Learning
                 Framework for {COVID-19} Diagnosis",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "18:1--18:24",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3421725",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3421725",
  abstract =     "In this article, we share our experiences in designing
                 and developing a suite of deep neural network-(DNN)
                 based COVID-19 case detection and recognition
                 framework. Existing pathological tests such as
                 RT-PCR-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:SFF,
  author =       "Yidong Li and Wenhua Liu and Yi Jin and Yuanzhouhan
                 Cao",
  title =        "{SPGAN}: Face Forgery Using Spoofing Generative
                 Adversarial Networks",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "19:1--19:20",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3432817",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3432817",
  abstract =     "Current face spoof detection schemes mainly rely on
                 physiological cues such as eye blinking, mouth
                 movements, and micro-expression changes, or textural
                 attributes of the face images [9]. But none of these
                 methods \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qi:2021:CAW,
  author =       "Lianyong Qi and Houbing Song and Xuyun Zhang and
                 Gautam Srivastava and Xiaolong Xu and Shui Yu",
  title =        "Compatibility-Aware {Web} {API} Recommendation for
                 Mashup Creation via Textual Description Mining",
  journal =      j-TOMM,
  volume =       "17",
  number =       "1s",
  pages =        "20:1--20:19",
  month =        mar,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3417293",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Apr 17 08:50:01 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3417293",
  abstract =     "With the ever-increasing prosperity of web Application
                 Programming Interface (API) sharing platforms, it is
                 becoming an economic and efficient way for software
                 developers to design their interested mashups \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Krishnan:2021:SEQ,
  author =       "Prabhakar Krishnan and Kurunandan Jain and Pramod
                 George Jose and Krishnashree Achuthan and Rajkumar
                 Buyya",
  title =        "{SDN} Enabled {QoE} and Security Framework for
                 Multimedia Applications in {5G} Networks",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2",
  pages =        "39:1--39:29",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3377390",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jun 5 07:35:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3377390",
  abstract =     "The technologies for real-time multimedia transmission
                 and immersive 3D gaming applications are rapidly
                 emerging, posing challenges in terms of performance,
                 security, authentication, data privacy, and encoding.
                 The communication channel for these \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kumar:2021:ESE,
  author =       "S. Sambath Kumar and M. Nandhini",
  title =        "Entropy Slicing Extraction and Transfer Learning
                 Classification for Early Diagnosis of {Alzheimer}
                 Diseases with {sMRI}",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2",
  pages =        "40:1--40:22",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3383749",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jun 5 07:35:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3383749",
  abstract =     "Alzheimer's Disease (AD) is an irreversible
                 neurogenerative disorder that undergoes progressive
                 decline in memory and cognitive function and is
                 characterized by structural brain Magnetic Resonance
                 Images (sMRI). In recent years, sMRI data has played a
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:TTF,
  author =       "Xiaolong Xu and Zijie Fang and Lianyong Qi and Xuyun
                 Zhang and Qiang He and Xiaokang Zhou",
  title =        "{TripRes}: Traffic Flow Prediction Driven Resource
                 Reservation for Multimedia {IoV} with Edge Computing",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2",
  pages =        "41:1--41:21",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3401979",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jun 5 07:35:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3401979",
  abstract =     "The Internet of Vehicles (IoV) connects vehicles,
                 roadside units (RSUs) and other intelligent objects,
                 enabling data sharing among them, thereby improving the
                 efficiency of urban traffic and safety. Currently,
                 collections of multimedia content, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liang:2021:FDI,
  author =       "Wei Liang and Jing Long and Kuan-Ching Li and Jianbo
                 Xu and Nanjun Ma and Xia Lei",
  title =        "A Fast Defogging Image Recognition Algorithm Based on
                 Bilateral Hybrid Filtering",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2",
  pages =        "42:1--42:16",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391297",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jun 5 07:35:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3391297",
  abstract =     "With the rapid advancement of video and image
                 processing technologies in the Internet of Things, it
                 is urgent to address the issues in real-time
                 performance, clarity, and reliability of image
                 recognition technology for a monitoring system in foggy
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Tong:2021:IPP,
  author = {Chao Tong and Mengze Zhang and Chao Lang and Zhigao Zheng},
  title = {An Image Privacy Protection Algorithm Based on Adversarial
    Perturbation Generative Networks},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {43},
  pages = {43:1--43:14},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3381088},
  url = {https://dl.acm.org/doi/10.1145/3381088},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Today, users of social platforms upload a large number of
    photos. These photos contain personal private information, including
    user identity information, which is easily gleaned by intelligent
    detection algorithms. To thwart this, in this work, we \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Fu:2021:FAA,
  author = {Yunfei Fu and Hongchuan Yu and Chih-Kuo Yeh and Tong-Yee Lee
    and Jian J. Zhang},
  title = {Fast Accurate and Automatic Brushstroke Extraction},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {44},
  pages = {44:1--44:24},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3429742},
  url = {https://dl.acm.org/doi/10.1145/3429742},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Brushstrokes are viewed as the artist's ``handwriting'' in a
    painting. In many applications such as style learning and transfer,
    mimicking painting, and painting authentication, it is highly desired
    to quantitatively and accurately identify brushstroke \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{K:2021:AML,
  author = {Mythili K. and Manish Narwaria},
  title = {Assessment of Machine Learning-Based Audiovisual Quality
    Predictors: Why Uncertainty Matters},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {45},
  pages = {45:1--45:22},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3430376},
  url = {https://dl.acm.org/doi/10.1145/3430376},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Quality assessment of audiovisual (AV) signals is important
    from the perspective of system design, optimization, and management of
    a modern multimedia communication system. However, automatic prediction
    of AV quality via the use of computational models \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Hama:2021:EUM,
  author = {Kenta Hama and Takashi Matsubara and Kuniaki Uehara and Jianfei
    Cai},
  title = {Exploring Uncertainty Measures for Image-caption
    Embedding-and-retrieval Task},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {46},
  pages = {46:1--46:19},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3425663},
  url = {https://dl.acm.org/doi/10.1145/3425663},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {With the significant development of black-box machine
    learning algorithms, particularly deep neural networks, the practical
    demand for reliability assessment is rapidly increasing. On the basis
    of the concept that ``Bayesian deep learning knows what it \ldots{}''},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Nguyen:2021:ISV,
  author = {Phuong-Anh Nguyen and Chong-Wah Ngo},
  title = {Interactive Search vs. Automatic Search: an Extensive Study on
    Video Retrieval},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {47},
  pages = {47:1--47:24},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3429457},
  url = {https://dl.acm.org/doi/10.1145/3429457},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {This article conducts user evaluation to study the
    performance difference between interactive and automatic search.
    Particularly, the study aims to provide empirical insights of how the
    performance landscape of video search changes, with tens of \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@Article{Li:2021:TAE,
  author =       "Yang Li and Guangcan Liu and Yubao Sun and Qingshan
                 Liu and Shengyong Chen",
  title =        "{$3$D} Tensor Auto-encoder with Application to Video
                 Compression",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2",
  pages =        "48:1--48:18",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3431768",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jun 5 07:35:45 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3431768",
  abstract =     "Auto-encoder has been widely used to compress
                 high-dimensional data such as the images and videos.
                 However, the traditional auto-encoder network needs to
                 store a large number of parameters. Namely, when the
                 input data is of dimension $n$, the number of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Mehrabi:2021:MTC,
  author = {Abbas Mehrabi and Matti Siekkinen and Teemu
    K{\"a}m{\"a}r{\"a}inen and Antti Yl{\"a}-J{\"a}{\"a}ski},
  title = {Multi-Tier {CloudVR}: Leveraging Edge Computing in Remote
    Rendered Virtual Reality},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {49},
  pages = {49:1--49:24},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3429441},
  url = {https://dl.acm.org/doi/10.1145/3429441},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {The availability of high bandwidth with low-latency
    communication in 5G mobile networks enables remote rendered real-time
    virtual reality (VR) applications. Remote rendering of VR graphics in a
    cloud removes the need for local personal computer for \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Sun:2021:ARO,
  author = {Lu Sun and Hussein {Al Osman} and Jochen Lang},
  title = {An Augmented Reality Online Assistance Platform for Repair
    Tasks},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {50},
  pages = {50:1--50:23},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3429285},
  url = {https://dl.acm.org/doi/10.1145/3429285},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Our augmented reality online assistance platform enables an
    expert to specify 6DoF movements of a component and apply the
    geometrical and physical constraints in real-time. We track the real
    components on the expert's side to monitor the operations of \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Zhao:2021:SAM,
  author = {Meiqi Zhao and Jianmin Zheng and Elvis S. Liu},
  title = {Server Allocation for Massively Multiplayer Online Cloud Games
    Using Evolutionary Optimization},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {51},
  pages = {51:1--51:23},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3433027},
  url = {https://dl.acm.org/doi/10.1145/3433027},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {In recent years, Massively Multiplayer Online Games (MMOGs)
    are becoming popular, partially due to their sophisticated graphics and
    broad virtual world, and cloud gaming is demanded more than ever
    especially when entertaining with light and portable \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Wei:2021:ISS,
  author = {Haiyang Wei and Zhixin Li and Feicheng Huang and Canlong Zhang
    and Huifang Ma and Zhongzhi Shi},
  title = {Integrating Scene Semantic Knowledge into Image Captioning},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {52},
  pages = {52:1--52:22},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3439734},
  url = {https://dl.acm.org/doi/10.1145/3439734},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Most existing image captioning methods use only the visual
    information of the image to guide the generation of captions, lack the
    guidance of effective scene semantic information, and the current
    visual attention mechanism cannot adjust the focus \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Gupta:2021:VSB,
  author = {Shikha Gupta and Krishan Sharma and Dileep Aroor Dinesh and
    Veena Thenkanidiyoor},
  title = {Visual Semantic-Based Representation Learning Using Deep {CNNs}
    for Scene Recognition},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {53},
  pages = {53:1--53:24},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3436494},
  url = {https://dl.acm.org/doi/10.1145/3436494},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {In this work, we address the task of scene recognition from
    image data. A scene is a spatially correlated arrangement of various
    visual semantic contents also known as concepts, e.g., ``chair,''
    ``car,'' ``sky,'' etc. Representation learning using visual \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Huang:2021:PCN,
  author = {Chun-ying Huang and Yun-chen Cheng and Guan-zhang Huang and
    Ching-ling Fan and Cheng-hsin Hsu},
  title = {On the Performance Comparisons of Native and Clientless
    Real-Time Screen-Sharing Technologies},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {54},
  pages = {54:1--54:26},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3437881},
  url = {https://dl.acm.org/doi/10.1145/3437881},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {Real-time screen-sharing provides users with ubiquitous
    access to remote applications, such as computer games, movie players,
    and desktop applications (apps), anywhere and anytime. In this article,
    we study the performance of different screen-sharing \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Yang:2021:ACG,
  author = {Xin Yang and Zongliang Ma and Letian Yu and Ying Cao and Baocai
    Yin and Xiaopeng Wei and Qiang Zhang and Rynson W. H. Lau},
  title = {Automatic Comic Generation with Stylistic Multi-page Layouts and
    Emotion-driven Text Balloon Generation},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {55},
  pages = {55:1--55:19},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3440053},
  url = {https://dl.acm.org/doi/10.1145/3440053},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {In this article, we propose a fully automatic system for
    generating comic books from videos without any human intervention.
    Given an input video along with its subtitles, our approach first
    extracts informative keyframes by analyzing the subtitles and
    \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Sharma:2021:HQF,
  author = {Prasen Kumar Sharma and Sujoy Ghosh and Arijit Sur},
  title = {High-quality Frame Recurrent Video De-raining with
    Multi-contextual Adversarial Network},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {56},
  pages = {56:1--56:24},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3444974},
  url = {https://dl.acm.org/doi/10.1145/3444974},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {In this article, we address the problem of rain-streak
    removal in the videos. Unlike the image, challenges in video
    restoration comprise temporal consistency besides spatial enhancement.
    The researchers across the world have proposed several effective
    \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Lan:2021:STR,
  author = {Xiangyuan Lan and Zifei Yang and Wei Zhang and Pong C. Yuen},
  title = {Spatial-temporal Regularized Multi-modality Correlation Filters
    for Tracking with Re-detection},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2},
  articleno = {57},
  pages = {57:1--57:16},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3430257},
  url = {https://dl.acm.org/doi/10.1145/3430257},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  abstract = {The development of multi-spectrum image sensing technology
    has brought great interest in exploiting the information of multiple
    modalities (e.g., RGB and infrared modalities) for solving computer
    vision problems. In this article, we investigate how to \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate = {Sat Jun 5 07:35:45 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Singh:2021:ISI,
  author = {Amit Kumar Singh and Zhihan Lv and Hoon Ko},
  title = {Introduction to the Special Issue on {Recent Trends in Medical
    Data Security for e-Health Applications}},
  journal = j-TOMM,
  fjournal = {ACM Transactions on Multimedia Computing, Communications, and
    Applications},
  ajournal = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url = {https://dl.acm.org/loi/tomm},
  volume = {17},
  number = {2s},
  articleno = {58},
  pages = {58:1--58:3},
  month = jun,
  year = {2021},
  doi = {https://doi.org/10.1145/3459601},
  url = {https://dl.acm.org/doi/10.1145/3459601},
  issn = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l = {1551-6857},
  coden = {????},
  acknowledgement = ack-nhfb,
  bibdate = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@Article{Singh:2021:SHD,
  author =       "A. K. Singh and A. Anand and Z. Lv and H. Ko and A.
                 Mohan",
  title =        "A Survey on Healthcare Data: a Security Perspective",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "59:1--59:26",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3422816",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3422816",
  abstract =     "With the remarkable development of internet
                 technologies, the popularity of smart healthcare has
                 regularly come to the fore. Smart healthcare uses
                 advanced technologies to transform the traditional
                 medical system in an \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2021:SPP,
  author =       "Hongjiao Wu and Ashutosh Dhar Dwivedi and Gautam
                 Srivastava",
  title =        "Security and Privacy of Patient Information in Medical
                 Systems Based on Blockchain Technology",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "60:1--60:17",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3408321",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3408321",
  abstract =     "The essence of ``blockchain'' is a shared database in
                 which information stored is un-falsifiable, traceable,
                 open, and transparent. Therefore, to improve the
                 security of private information in medical systems,
                 this article \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:OBR,
  author =       "Ting Wang and Xiangjun Ji and Aiguo Song and Kurosh
                 Madani and Amine Chohra and Huimin Lu and Ramon
                 Monero",
  title =        "Output-Bounded and {RBFNN}-Based Position Tracking and
                 Adaptive Force Control for Security Tele-Surgery",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "61:1--61:15",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3394920",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3394920",
  abstract =     "In security e-health brain neurosurgery, one of the
                 important processes is to move the electrocoagulation
                 to the appropriate position in order to excavate the
                 diseased tissue. However, it has been problematic
                 for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Alkhariji:2021:SPD,
  author =       "Lamya Alkhariji and Nada Alhirabi and Mansour Naser
                 Alraja and Mahmoud Barhamgi and Omer Rana and Charith
                 Perera",
  title =        "Synthesising Privacy by Design Knowledge Toward
                 Explainable {Internet of Things} Application Designing
                 in Healthcare",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "62:1--62:29",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434186",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434186",
  abstract =     "Privacy by Design (PbD) is the most common approach
                 followed by software developers who aim to reduce risks
                 within their application designs, yet it remains
                 commonplace for developers to retain little \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tanveer:2021:PLT,
  author =       "M. Tanveer and Tarun Gupta and Miten Shah and {For the
                 Alzheimer's Disease Neuroimaging Initiative}",
  title =        "Pinball Loss Twin Support Vector Clustering",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "63:1--63:23",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3409264",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3409264",
  abstract =     "Twin Support Vector Clustering (TWSVC) is a clustering
                 algorithm inspired by the principles of Twin Support
                 Vector Machine (TWSVM). TWSVC has already outperformed
                 other traditional plane based clustering algorithms.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sahu:2021:LMP,
  author =       "Amiya Kumar Sahu and Suraj Sharma and Deepak Puthal",
  title =        "Lightweight Multi-party Authentication and Key
                 Agreement Protocol in {IoT}-based E-Healthcare
                 Service",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "64:1--64:20",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3398039",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3398039",
  abstract =     "Internet of Things (IoT) is playing a promising role
                 in e-healthcare applications in the recent decades;
                 nevertheless, security is one of the crucial challenges
                 in the current field of study. Many healthcare devices
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Rajput:2021:SBS,
  author =       "Amitesh Singh Rajput and Vishesh Kumar Tanwar and
                 Balasubramanian Raman",
  title =        "{$\mathcal{Z}$}-Score-Based Secure Biomedical Model
                 for Effective Skin Lesion Segmentation Over {eHealth}
                 Cloud",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "65:1--65:19",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3430806",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3430806",
  abstract =     "This study aims to process the private medical data
                 over eHealth cloud platform. The current pandemic
                 situation, caused by Covid19 has made us to realize the
                 importance of automatic remotely operated independent
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "65",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Singh:2021:EEB,
  author =       "Ashima Singh and Arwinder Dhillon and Neeraj Kumar and
                 M. Shamim Hossain and Ghulam Muhammad and Manoj Kumar",
  title =        "{eDiaPredict}: an Ensemble-based Framework for
                 Diabetes Prediction",
  journal =      j-TOMM,
  volume =       "17",
  number =       "2s",
  pages =        "66:1--66:26",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3415155",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Tue Jun 22 08:33:16 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3415155",
  abstract =     "Medical systems incorporate modern computational
                 intelligence in healthcare. Machine learning techniques
                 are applied to predict the onset and reoccurrence of
                 the disease, identify biomarkers for survivability
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "66",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Amato:2021:SPV,
  author          = {Flora Amato and Valentina Casola and Giovanni
                     Cozzolino and Alessandra {De Benedictis} and Nicola
                     Mazzocca and Francesco Moscato},
  title           = {A Security and Privacy Validation Methodology for
                     e-Health Systems},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {67},
  pages           = {67:1--67:22},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3412373},
  url             = {https://dl.acm.org/doi/10.1145/3412373},
  abstract        = {e-Health applications enable one to acquire, process,
                     and share patient medical data to improve diagnosis,
                     treatment, and patient monitoring. Despite the
                     undeniable benefits brought by the digitization of
                     health systems, the \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Kasyap:2021:PPD,
  author          = {Harsh Kasyap and Somanath Tripathy},
  title           = {Privacy-preserving Decentralized Learning Framework
                     for Healthcare System},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {68},
  pages           = {68:1--68:24},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3426474},
  url             = {https://dl.acm.org/doi/10.1145/3426474},
  abstract        = {Clinical trials and drug discovery would not be
                     effective without the collaboration of institutions.
                     Earlier, it has been at the cost of individual's
                     privacy. Several pacts and compliances have been
                     enforced to avoid data \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Shamsolmoali:2021:ISI,
  author          = {Pourya Shamsolmoali and Ruili Wang and A. H. Sadka},
  title           = {Introduction to the Special Issue on {Advanced
                     Approaches for Multiple Instance Learning on Multimedia
                     Applications}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {69},
  pages           = {69:1--69:2},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3459603},
  url             = {https://dl.acm.org/doi/10.1145/3459603},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Ji:2021:MPG,
  author          = {Ruyi Ji and Zeyu Liu and Libo Zhang and Jianwei Liu
                     and Xin Zuo and Yanjun Wu and Chen Zhao and Haofeng
                     Wang and Lin Yang},
  title           = {Multi-peak Graph-based Multi-instance Learning for
                     Weakly Supervised Object Detection},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {70},
  pages           = {70:1--70:21},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3432861},
  url             = {https://dl.acm.org/doi/10.1145/3432861},
  abstract        = {Weakly supervised object detection (WSOD), aiming to
                     detect objects with only image-level annotations, has
                     become one of the research hotspots over the past few
                     years. Recently, much effort has been devoted to
                     \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Ding:2021:MSA,
  author          = {Yaoling Ding and Liehuang Zhu and An Wang and Yuan Li
                     and Yongjuan Wang and Siu Ming Yiu and Keke Gai},
  title           = {A Multiple Sieve Approach Based on Artificial
                     Intelligent Techniques and Correlation Power Analysis},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {71},
  pages           = {71:1--71:21},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3433165},
  url             = {https://dl.acm.org/doi/10.1145/3433165},
  abstract        = {Side-channel analysis achieves key recovery by
                     analyzing physical signals generated during the
                     operation of cryptographic devices. Power consumption
                     is one kind of these signals and can be regarded as a
                     \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Ji:2021:MIM,
  author          = {Wanting Ji and Ruili Wang},
  title           = {A Multi-instance Multi-label Dual Learning Approach
                     for Video Captioning},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {72},
  pages           = {72:1--72:18},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3446792},
  url             = {https://dl.acm.org/doi/10.1145/3446792},
  abstract        = {Video captioning is a challenging task in the field of
                     multimedia processing, which aims to generate
                     informative natural language descriptions/captions to
                     describe video contents. Previous video captioning
                     approaches \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Zareapoor:2021:EAN,
  author          = {Masoumeh Zareapoor and Jie Yang},
  title           = {Equivariant Adversarial Network for Image-to-image
                     Translation},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {73},
  pages           = {73:1--73:14},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3458280},
  url             = {https://dl.acm.org/doi/10.1145/3458280},
  abstract        = {Image-to-Image translation aims to learn an image from
                     a source domain to a target domain. However, there are
                     three main challenges, such as lack of paired datasets,
                     multimodality, and diversity, that are associated
                     \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Mohammed:2021:MAF,
  author          = {Mazin Abed Mohammed and Mohamed Elhoseny and Karrar
                     Hameed Abdulkareem and Salama A. Mostafa and Mashael S.
                     Maashi},
  title           = {A Multi-agent Feature Selection and Hybrid
                     Classification Model for {Parkinson}'s Disease
                     Diagnosis},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {74},
  pages           = {74:1--74:22},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3433180},
  url             = {https://dl.acm.org/doi/10.1145/3433180},
  abstract        = {Parkinson's disease (PD) diagnostics includes numerous
                     analyses related to the neurological, physical, and
                     psychical status of the patient. Medical teams analyze
                     multiple symptoms and patient history considering
                     \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{An:2021:MTU,
  author          = {Na An and Wei Qi Yan},
  title           = {Multitarget Tracking Using {Siamese} Neural Networks},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {75},
  pages           = {75:1--75:16},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3441656},
  url             = {https://dl.acm.org/doi/10.1145/3441656},
  abstract        = {In this article, we detect and track visual objects by
                     using Siamese network or twin neural network. The
                     Siamese network is constructed to classify moving
                     objects based on the associations of object detection
                     network and \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Tang:2021:MCA,
  author          = {Xiaochuan Tang and Mingzhe Liu and Hao Zhong and
                     Yuanzhen Ju and Weile Li and Qiang Xu},
  title           = {{MILL}: Channel Attention-based Deep Multiple Instance
                     Learning for Landslide Recognition},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {2s},
  articleno       = {76},
  pages           = {76:1--76:11},
  month           = jun,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3454009},
  url             = {https://dl.acm.org/doi/10.1145/3454009},
  abstract        = {Landslide recognition is widely used in natural
                     disaster risk management. Traditional landslide
                     recognition is mainly conducted by geologists, which is
                     accurate but inefficient. This article introduces
                     multiple instance learning \ldots{} (More)},
  bibdate         = {Tue Jun 22 08:33:16 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Li:2021:NNB,
  author          = {Yue Li and Yan Yi and Dong Liu and Li Li and Zhu Li
                     and Houqiang Li},
  title           = {Neural-Network-Based Cross-Channel Intra Prediction},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {77},
  pages           = {77:1--77:23},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3434250},
  url             = {https://dl.acm.org/doi/10.1145/3434250},
  abstract        = {To reduce the redundancy among different color
                     channels, e.g., YUV, previous methods usually adopt a
                     linear model that tends to be oversimple for complex
                     image content. We propose a neural-network-based method
                     for cross-channel prediction in intra frame \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Liu:2021:MML,
  author          = {Zhandong Liu and Wengang Zhou and Houqiang Li},
  title           = {{MFECN}: Multi-level Feature Enhanced Cumulative
                     Network for Scene Text Detection},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {78},
  pages           = {78:1--78:22},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3440087},
  url             = {https://dl.acm.org/doi/10.1145/3440087},
  abstract        = {Recently, many scene text detection algorithms have
                     achieved impressive performance by using convolutional
                     neural networks. However, most of them do not make full
                     use of the context among the hierarchical multi-level
                     features to improve the performance of \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Dong:2021:SCL,
  author          = {Xingbo Dong and Soohyong Kim and Zhe Jin and Jung Yeon
                     Hwang and Sangrae Cho and Andrew Beng Jin Teoh},
  title           = {Secure Chaff-less Fuzzy Vault for Face Identification
                     Systems},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {79},
  pages           = {79:1--79:22},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3442198},
  url             = {https://dl.acm.org/doi/10.1145/3442198},
  abstract        = {Biometric cryptosystems such as fuzzy vaults represent
                     one of the most popular approaches for secret and
                     biometric template protection. However, they are solely
                     designed for biometric verification, where the user is
                     required to input both identity \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Hu:2021:GLE,
  author          = {Hezhen Hu and Wengang Zhou and Junfu Pu and Houqiang
                     Li},
  title           = {Global-Local Enhancement Network for {NMF}-Aware Sign
                     Language Recognition},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {80},
  pages           = {80:1--80:19},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3436754},
  url             = {https://dl.acm.org/doi/10.1145/3436754},
  abstract        = {Sign language recognition (SLR) is a challenging
                     problem, involving complex manual features (i.e., hand
                     gestures) and fine-grained non-manual features (NMFs)
                     (i.e., facial expression, mouth shapes, etc.).
                     Although manual features are dominant, non-manual
                     \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Lin:2021:RRN,
  author          = {Feng Lin and Wengang Zhou and Jiajun Deng and Bin Li
                     and Yan Lu and Houqiang Li},
  title           = {Residual Refinement Network with Attribute Guidance
                     for Precise Saliency Detection},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {81},
  pages           = {81:1--81:19},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3440694},
  url             = {https://dl.acm.org/doi/10.1145/3440694},
  abstract        = {As an important topic in the multimedia and computer
                     vision fields, salient object detection has been
                     researched for years. Recently, state-of-the-art
                     performance has been witnessed with the aid of the
                     fully convolutional networks (FCNs) and the various
                     \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Zheng:2021:IIR,
  author          = {Hongdi Zheng and Junfeng Wang and Jianping Zhang and
                     Ruirui Li},
  title           = {{IRTS}: an Intelligent and Reliable Transmission
                     Scheme for Screen Updates Delivery in {DaaS}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {82},
  pages           = {82:1--82:24},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3440035},
  url             = {https://dl.acm.org/doi/10.1145/3440035},
  abstract        = {Desktop-as-a-service (DaaS) has been recognized as an
                     elastic and economical solution that enables users to
                     access personal desktops from anywhere at any time.
                     During the interaction process of DaaS, users rely on
                     screen updates to perceive execution \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Wang:2021:SCG,
  author          = {Rui Wang and Dong Liang and Xiaochun Cao and Yuanfang
                     Guo},
  title           = {Semantic Correspondence with Geometric Structure
                     Analysis},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {83},
  pages           = {83:1--83:21},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3441576},
  url             = {https://dl.acm.org/doi/10.1145/3441576},
  abstract        = {This article studies the correspondence problem for
                     semantically similar images, which is challenging due
                     to the joint visual and geometric deformations. We
                     introduce the Flip-aware Distance Ratio method (FDR) to
                     solve this problem from the perspective of \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Liu:2021:SSS,
  author          = {Xinfang Liu and Xiushan Nie and Junya Teng and Li Lian
                     and Yilong Yin},
  title           = {Single-shot Semantic Matching Network for Moment
                     Localization in Videos},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {84},
  pages           = {84:1--84:14},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3441577},
  url             = {https://dl.acm.org/doi/10.1145/3441577},
  abstract        = {Moment localization in videos using natural language
                     refers to finding the most relevant segment from videos
                     given a natural language query. Most of the existing
                     methods require video segment candidates for further
                     matching with the query, which leads to \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Alaya:2021:PBD,
  author          = {Bechir Alaya},
  title           = {Payoff-based Dynamic Segment Replication and Graph
                     Classification Method with Attribute Vectors Adapted to
                     Urban {VANET}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {85},
  pages           = {85:1--85:22},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3440018},
  url             = {https://dl.acm.org/doi/10.1145/3440018},
  abstract        = {Due to the number of constraints and the dynamic
                     nature of vehicular ad hoc networks (VANET), effective
                     video broadcasting always remains a difficult task. In
                     this work, we proposed a quality of video visualization
                     guarantee model based on a feedback loop \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Dhiman:2021:PWS,
  author          = {Chhavi Dhiman and Dinesh Kumar Vishwakarma and Paras
                     Agarwal},
  title           = {Part-wise Spatio-temporal Attention Driven {CNN}-based
                     {$3$D} Human Action Recognition},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {86},
  pages           = {86:1--86:24},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3441628},
  url             = {https://dl.acm.org/doi/10.1145/3441628},
  abstract        = {Recently, human activity recognition using skeleton
                     data is increasing due to its ease of acquisition and
                     finer shape details. Still, it suffers from a wide
                     range of intra-class variation, inter-class similarity
                     among the actions and view variation due to \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Nie:2021:PPF,
  author          = {Jie Nie and Zhi-Qiang Wei and Weizhi Nie and An-An
                     Liu},
  title           = {{PGNet}: Progressive Feature Guide Learning Network
                     for Three-dimensional Shape Recognition},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {87},
  pages           = {87:1--87:17},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3443708},
  url             = {https://dl.acm.org/doi/10.1145/3443708},
  abstract        = {Three-dimensional (3D) shape recognition is a popular
                     topic and has potential application value in the field
                     of computer vision. With the recent proliferation of
                     deep learning, various deep learning models have
                     achieved state-of-the-art performance. Among \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Liu:2021:VDB,
  author          = {Shiguang Liu and Huixin Wang and Xiaoli Zhang},
  title           = {Video Decolorization Based on the {CNN} and {LSTM}
                     Neural Network},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {88},
  pages           = {88:1--88:18},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3446619},
  url             = {https://dl.acm.org/doi/10.1145/3446619},
  abstract        = {Video decolorization is the process of transferring
                     three-channel color videos into single-channel
                     grayscale videos, which is essentially the
                     decolorization operation of video frames. Most existing
                     video decolorization algorithms directly apply image
                     \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@article{Yang:2021:DCN,
  author          = {Zhenzhen Yang and Pengfei Xu and Yongpeng Yang and
                     Bing-Kun Bao},
  title           = {A Densely Connected Network Based on {U-Net} for
                     Medical Image Segmentation},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  journal-url     = {https://dl.acm.org/loi/tomm},
  volume          = {17},
  number          = {3},
  articleno       = {89},
  pages           = {89:1--89:14},
  month           = aug,
  year            = {2021},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  doi             = {https://doi.org/10.1145/3446618},
  url             = {https://dl.acm.org/doi/10.1145/3446618},
  abstract        = {The U-Net has become the most popular structure in
                     medical image segmentation in recent years. Although
                     its performance for medical image segmentation is
                     outstanding, a large number of experiments demonstrate
                     that the classical U-Net network architecture
                     \ldots{}},
  bibdate         = {Thu Aug 19 08:56:09 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2021:LCF,
  author =       "Donglin Zhang and Xiao-Jun Wu and Jun Yu",
  title =        "Label Consistent Flexible Matrix Factorization Hashing
                 for Efficient Cross-modal Retrieval",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "90:1--90:18",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446774",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446774",
  abstract =     "Hashing methods have sparked a great revolution on
                 large-scale cross-media search due to its effectiveness
                 and efficiency. Most existing approaches learn unified
                 hash representation in a common Hamming space to
                 represent all multimodal data. However, the unified
                 hash codes may not characterize the cross-modal data
                 discriminatively, because the data may vary greatly due
                 to its different dimensionalities, physical properties,
                 and statistical information. In addition, most existing
                 supervised cross-modal algorithms preserve the
                 similarity relationship by constructing an $ n \times n
                 $ pairwise similarity matrix, which requires a large
                 amount of calculation and loses the category
                 information. To mitigate these issues, a novel
                 cross-media hashing approach is proposed in this
                 article, dubbed label flexible matrix factorization
                 hashing (LFMH). Specifically, LFMH jointly learns the
                 modality-specific latent subspace with similar semantic
                 by the flexible matrix factorization. In addition, LFMH
                 guides the hash learning by utilizing the semantic
                 labels directly instead of the large $ n \times n $
                 pairwise similarity matrix. LFMH transforms the
                 heterogeneous data into modality-specific latent
                 semantic representation. Therefore, we can obtain the
                 hash codes by quantifying the representations, and the
                 learned hash codes are consistent with the supervised
                 labels of multimodal data. Then, we can obtain the
                 similar binary codes of the corresponding modality, and
                 the binary codes can characterize such samples
                 flexibly. Accordingly, the derived hash codes have more
                 discriminative power for single-modal and cross-modal
                 retrieval tasks. Extensive experiments on eight
                 different databases demonstrate that our model
                 outperforms some competitive approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "90",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lokoc:2021:RIS,
  author =       "Jakub Lokoc and Patrik Vesel{\'y} and Frantisek
                 Mejzl{\'\i}k and Gregor Kovalc{\'\i}k and Tom{\'a}s
                 Soucek and Luca Rossetto and Klaus Schoeffmann and
                 Werner Bailer and Cathal Gurrin and Loris Sauter and
                 Jaeyub Song and Stefanos Vrochidis and Jiaxin Wu and
                 Bj{\"o}rn {\TH}{\'o}r J{\'o}nsson",
  title =        "Is the Reign of Interactive Search Eternal? {Findings}
                 from the {Video Browser Showdown 2020}",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "91:1--91:26",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3445031",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3445031",
  abstract =     "Comprehensive and fair performance evaluation of
                 information retrieval systems represents an essential
                 task for the current information age. Whereas
                 Cranfield-based evaluations with benchmark datasets
                 support development of retrieval models, significant
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "91",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:LIR,
  author =       "Qianli Xu and Ana Garcia {Del Molino} and Jie Lin and
                 Fen Fang and Vigneshwaran Subbaraju and Liyuan Li and
                 Joo-Hwee Lim",
  title =        "Lifelog Image Retrieval Based on Semantic Relevance
                 Mapping",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "92:1--92:18",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446209",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446209",
  abstract =     "Lifelog analytics is an emerging research area with
                 technologies embracing the latest advances in machine
                 learning, wearable computing, and data analytics.
                 However, state-of-the-art technologies are still
                 inadequate to distill voluminous multimodal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "92",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Du:2021:RTE,
  author =       "Gaoming Du and Jiting Wu and Hongfang Cao and Kun Xing
                 and Zhenmin Li and Duoli Zhang and Xiaolei Wang",
  title =        "A Real-Time Effective Fusion-Based Image Defogging
                 Architecture on {FPGA}",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "93:1--93:21",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446241",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446241",
  abstract =     "Foggy weather reduces the visibility of photographed
                 objects, causing image distortion and decreasing
                 overall image quality. Many approaches (e.g., image
                 restoration, image enhancement, and fusion-based
                 methods) have been proposed to work out the problem.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "93",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2021:FRS,
  author =       "Chenglizhao Chen and Hongmeng Zhao and Huan Yang and
                 Teng Yu and Chong Peng and Hong Qin",
  title =        "Full-reference Screen Content Image Quality Assessment
                 by Fusing Multilevel Structure Similarity",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "94:1--94:21",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447393",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447393",
  abstract =     "Screen content images (SCIs) usually comprise various
                 content types with sharp edges, in which artifacts or
                 distortions can be effectively sensed by a vanilla
                 structure similarity measurement in a full-reference
                 manner. Nonetheless, almost all of the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "94",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:DBS,
  author =       "Honglin Li and Xiaoyang Mao and Mengdi Xu and Xiaogang
                 Jin",
  title =        "Deep-based Self-refined Face-top Coordination",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "95:1--95:23",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3446970",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3446970",
  abstract =     "Face-top coordination, which exists in most
                 clothes-fitting scenarios, is challenging due to
                 varieties of attributes, implicit correlations, and
                 tradeoffs between general preferences and individual
                 preferences. We present a Deep-Based Self-Refined
                 (DBSR) \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "95",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lin:2021:DAM,
  author =       "Minxuan Lin and Fan Tang and Weiming Dong and Xiao Li
                 and Changsheng Xu and Chongyang Ma",
  title =        "Distribution Aligned Multimodal and Multi-domain Image
                 Stylization",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "96:1--96:17",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3450525",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3450525",
  abstract =     "Multimodal and multi-domain stylization are two
                 important problems in the field of image style
                 transfer. Currently, there are few methods that can
                 perform multimodal and multi-domain stylization
                 simultaneously. In this study, we propose a unified
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "96",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Du:2021:IGS,
  author =       "Yong Du and Yangyang Xu and Taizhong Ye and Qiang Wen
                 and Chufeng Xiao and Junyu Dong and Guoqiang Han and
                 Shengfeng He",
  title =        "Invertible Grayscale with Sparsity Enforcing Priors",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "97:1--97:17",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3451993",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3451993",
  abstract =     "Color dimensionality reduction is believed as a
                 non-invertible process, as re-colorization results in
                 perceptually noticeable and unrecoverable distortion.
                 In this article, we propose to convert a color image
                 into a grayscale image that can fully recover
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "97",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qian:2021:KAM,
  author =       "Shengsheng Qian and Jun Hu and Quan Fang and
                 Changsheng Xu",
  title =        "Knowledge-aware Multi-modal Adaptive Graph
                 Convolutional Networks for Fake News Detection",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3",
  pages =        "98:1--98:23",
  month =        aug,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3451215",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Aug 19 08:56:09 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3451215",
  abstract =     "In this article, we focus on fake news detection task
                 and aim to automatically identify the fake news from
                 vast amount of social media posts. To date, many
                 approaches have been proposed to detect fake news,
                 which includes traditional learning methods and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "98",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2021:ISI,
  author =       "Yu-Dong Zhang and Juan Manuel Gorriz and Zhengchao
                 Dong",
  title =        "Introduction to the Special Issue on Explainable Deep
                 Learning for Medical Image Computing",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "99:1--99:2",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485046",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485046",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "99",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ni:2021:LCL,
  author =       "Tongguang Ni and Yan Ding and Jing Xue and Kaijian Xia
                 and Xiaoqing Gu and Yizhang Jiang",
  title =        "Local Constraint and Label Embedding Multi-layer
                 Dictionary Learning for Sperm Head Classification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "100:1--100:16",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458927",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458927",
  abstract =     "Morphological classification of human sperm heads is a
                 key technology for diagnosing male infertility. Due to
                 its sparse representation and learning capability,
                 dictionary learning has shown remarkable performance in
                 human sperm head classification. To \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "100",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2021:DAC,
  author =       "Bingzhi Chen and Yishu Liu and Zheng Zhang and
                 Yingjian Li and Zhao Zhang and Guangming Lu and
                 Hongbing Yu",
  title =        "Deep Active Context Estimation for Automated
                 {COVID-19} Diagnosis",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "101:1--101:22",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3457124",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3457124",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "101",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2021:MIC,
  author =       "Xiangbin Liu and Jiesheng He and Liping Song and Shuai
                 Liu and Gautam Srivastava",
  title =        "Medical Image Classification based on an Adaptive Size
                 Deep Learning Model",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "102:1--102:18",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3465220",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3465220",
  abstract =     "With the rapid development of Artificial Intelligence
                 (AI), deep learning has increasingly become a research
                 hotspot in various fields, such as medical image
                 classification. Traditional deep learning models use
                 Bilinear Interpolation when processing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "102",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lu:2021:EFD,
  author =       "Siyuan Lu and Di Wu and Zheng Zhang and Shui-Hua
                 Wang",
  title =        "An Explainable Framework for Diagnosis of {COVID-19}
                 Pneumonia via Transfer Learning and Discriminant
                 Correlation Analysis",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "103:1--103:16",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3449785",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3449785",
  abstract =     "The new coronavirus COVID-19 has been spreading all
                 over the world in the last six months, and the death
                 toll is still rising. The accurate diagnosis of
                 COVID-19 is an emergent task as to stop the spreading
                 of the virus. In this paper, we proposed to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "103",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Alizadehsani:2021:UAS,
  author =       "Roohallah Alizadehsani and Danial Sharifrazi and Navid
                 Hoseini Izadi and Javad Hassannataj Joloudari and
                 Afshin Shoeibi and Juan M. Gorriz and Sadiq Hussain and
                 Juan E. Arco and Zahra Alizadeh Sani and Fahime
                 Khozeimeh and Abbas Khosravi and Saeid Nahavandi and
                 Sheikh Mohammed Shariful Islam and U. Rajendra
                 Acharya",
  title =        "Uncertainty-Aware Semi-Supervised Method Using Large
                 Unlabeled and Limited Labeled {COVID-19} Data",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "104:1--104:24",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462635",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462635",
  abstract =     "The new coronavirus has caused more than one million
                 deaths and continues to spread rapidly. This virus
                 targets the lungs, causing respiratory distress which
                 can be mild or severe. The X-ray or computed tomography
                 (CT) images of lungs can reveal whether \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "104",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kumar:2021:DDE,
  author =       "Ambeshwar Kumar and Ramachandran Manikandan and Utku
                 Kose and Deepak Gupta and Suresh C. Satapathy",
  title =        "Doctor's Dilemma: Evaluating an Explainable
                 Subtractive Spatial Lightweight Convolutional Neural
                 Network for Brain Tumor Diagnosis",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "105:1--105:26",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3457187",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3457187",
  abstract =     "In Medicine Deep Learning has become an essential tool
                 to achieve outstanding diagnosis on image data.
                 However, one critical problem is that Deep Learning
                 comes with complicated, black-box models so it is not
                 possible to analyze their trust level \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "105",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Su:2021:HRP,
  author =       "Ge Su and Bo Lin and Wei Luo and Jianwei Yin and
                 Shuiguang Deng and Honghao Gao and Renjun Xu",
  title =        "Hypomimia Recognition in {Parkinson}'s Disease With
                 Semantic Features",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "106:1--106:20",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3476778",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3476778",
  abstract =     "Parkinson's disease is the second most common
                 neurodegenerative disorder, commonly affecting elderly
                 people over the age of 65. As the cardinal
                 manifestation, hypomimia, referred to as impairments in
                 normal facial expressions, stays covert. Even some
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "106",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xin:2021:WEG,
  author =       "Qi Xin and Shaohao Hu and Shuaiqi Liu and Ling Zhao
                 and Shuihua Wang",
  title =        "{WTRPNet}: an Explainable Graph Feature Convolutional
                 Neural Network for Epileptic {EEG} Classification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "107:1--107:18",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460522",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460522",
  abstract =     "As one of the important tools of epilepsy diagnosis,
                 the electroencephalogram (EEG) is noninvasive and
                 presents no traumatic injury to patients. It contains a
                 lot of physiological and pathological information that
                 is easy to obtain. The automatic \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "107",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cheng:2021:ISI,
  author =       "Wen-Huang Cheng and Jiaying Liu and Nicu Sebe and
                 Junsong Yuan and Hong-Han Shuai",
  title =        "Introduction to the Special Issue on Explainable {AI}
                 on Multimedia Computing",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "108:1--108:2",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3489522",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3489522",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "108",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2021:LFS,
  author =       "Jiguo Li and Xinfeng Zhang and Jizheng Xu and Siwei Ma
                 and Wen Gao",
  title =        "Learning to Fool the Speaker Recognition",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "109:1--109:21",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3468673",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468673",
  abstract =     "Due to the widespread deployment of
                 fingerprint/face/speaker recognition systems, the risk
                 in these systems, especially the adversarial attack,
                 has drawn increasing attention in recent years.
                 Previous researches mainly studied the adversarial
                 attack to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "109",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yan:2021:PNR,
  author =       "Chenggang Yan and Tong Teng and Yutao Liu and Yongbing
                 Zhang and Haoqian Wang and Xiangyang Ji",
  title =        "Precise No-Reference Image Quality Evaluation Based on
                 Distortion Identification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "110:1--110:21",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3468872",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468872",
  abstract =     "The difficulty of no-reference image quality
                 assessment (NR IQA) often lies in the lack of knowledge
                 about the distortion in the image, which makes quality
                 assessment blind and thus inefficient. To tackle such
                 issue, in this article, we propose a novel \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "110",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2021:EAM,
  author =       "Yung-Yao Chen and Sin-Ye Jhong and Chih-Hsien Hsia and
                 Kai-Lung Hua",
  title =        "Explainable {AI}: a Multispectral Palm-Vein
                 Identification System with New Augmentation Features",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "111:1--111:21",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3468873",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468873",
  abstract =     "Recently, as one of the most promising biometric
                 traits, the vein has attracted the attention of both
                 academia and industry because of its living body
                 identification and the convenience of the acquisition
                 process. State-of-the-art techniques can provide
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "111",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lin:2021:XEC,
  author =       "Yu-Sheng Lin and Zhe-Yu Liu and Yu-An Chen and
                 Yu-Siang Wang and Ya-Liang Chang and Winston H. Hsu",
  title =        "{xCos}: an Explainable Cosine Metric for Face
                 Verification Task",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "112:1--112:16",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3469288",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3469288",
  abstract =     "We study the XAI (explainable AI) on the face
                 recognition task, particularly the face verification.
                 Face verification has become a crucial task in recent
                 days and it has been deployed to plenty of
                 applications, such as access control, surveillance, and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "112",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shorfuzzaman:2021:EDL,
  author =       "Mohammad Shorfuzzaman and M. Shamim Hossain and
                 Abdulmotaleb {El Saddik}",
  title =        "An Explainable Deep Learning Ensemble Model for Robust
                 Diagnosis of Diabetic Retinopathy Grading",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "113:1--113:24",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3469841",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3469841",
  abstract =     "Diabetic retinopathy (DR) is one of the most common
                 causes of vision loss in people who have diabetes for a
                 prolonged period. Convolutional neural networks (CNNs)
                 have become increasingly popular for computer-aided DR
                 diagnosis using retinal fundus \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "113",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2021:BBD,
  author =       "Zhenyu Wu and Zhaowen Wang and Ye Yuan and Jianming
                 Zhang and Zhangyang Wang and Hailin Jin",
  title =        "Black-Box Diagnosis and Calibration on {GAN}
                 Intra-Mode Collapse: a Pilot Study",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "114:1--114:18",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472768",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472768",
  abstract =     "Generative adversarial networks (GANs) nowadays are
                 capable of producing images of incredible realism. Two
                 concerns raised are whether the state-of-the-art GAN's
                 learned distribution still suffers from mode collapse
                 and what to do if so. Existing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "114",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xia:2021:SED,
  author =       "Bohui Xia and Xueting Wang and Toshihiko Yamasaki",
  title =        "Semantic Explanation for Deep Neural Networks Using
                 Feature Interactions",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "115:1--115:19",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3474557",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3474557",
  abstract =     "Given the promising results obtained by deep-learning
                 techniques in multimedia analysis, the explainability
                 of predictions made by networks has become important in
                 practical applications. We present a method to generate
                 semantic and quantitative \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "115",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:LDS,
  author =       "Yang Wang and Yang Cao and Jing Zhang and Feng Wu and
                 Zheng-Jun Zha",
  title =        "Leveraging Deep Statistics for Underwater Image
                 Enhancement",
  journal =      j-TOMM,
  volume =       "17",
  number =       "3s",
  pages =        "116:1--116:20",
  month =        oct,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3489520",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Dec 31 09:04:25 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3489520",
  abstract =     "Underwater imaging often suffers from color cast and
                 contrast degradation due to range-dependent medium
                 absorption and light scattering. Introducing image
                 statistics as prior has been proved to be an effective
                 solution for underwater image enhancement. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "116",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2021:DSG,
  author =       "Junyi Wu and Yan Huang and Qiang Wu and Zhipeng Gao
                 and Jianqiang Zhao and Liqin Huang",
  title =        "Dual-Stream Guided-Learning via a Priori Optimization
                 for Person Re-identification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "117:1--117:22",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447715",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447715",
  abstract =     "The task of person re-identification (re-ID) is to
                 find the same pedestrian across non-overlapping camera
                 views. Generally, the performance of person re-ID can
                 be affected by background clutter. However, existing
                 segmentation algorithms cannot obtain \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "117",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{He:2021:ACO,
  author =       "Zhaoliang He and Hongshan Li and Zhi Wang and Shutao
                 Xia and Wenwu Zhu",
  title =        "Adaptive Compression for Online Computer Vision: an
                 Edge Reinforcement Learning Approach",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "118:1--118:23",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447878",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447878",
  abstract =     "With the growth of computer vision-based applications,
                 an explosive amount of images have been uploaded to
                 cloud servers that host such online computer vision
                 algorithms, usually in the form of deep learning
                 models. JPEG has been used as the de facto \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "118",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Pan:2021:SDE,
  author =       "Yingwei Pan and Yue Chen and Qian Bao and Ning Zhang
                 and Ting Yao and Jingen Liu and Tao Mei",
  title =        "{Smart Director}: an Event-Driven Directing System for
                 Live Broadcasting",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "119:1--119:18",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3448981",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3448981",
  abstract =     "Live video broadcasting normally requires a multitude
                 of skills and expertise with domain knowledge to enable
                 multi-camera productions. As the number of cameras
                 keeps increasing, directing a live sports broadcast has
                 now become more complicated and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "119",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:DSS,
  author =       "Chunyan Xu and Rong Liu and Tong Zhang and Zhen Cui
                 and Jian Yang and Chunlong Hu",
  title =        "Dual-Stream Structured Graph Convolution Network for
                 Skeleton-Based Action Recognition",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "120:1--120:22",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3450410",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3450410",
  abstract =     "In this work, we propose a dual-stream structured
                 graph convolution network (DS-SGCN) to solve the
                 skeleton-based action recognition problem. The
                 spatio-temporal coordinates and appearance contexts of
                 the skeletal joints are jointly integrated into the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "120",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2021:UDE,
  author =       "Jie Wang and Kaibin Tian and Dayong Ding and Gang Yang
                 and Xirong Li",
  title =        "Unsupervised Domain Expansion for Visual
                 Categorization",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "121:1--121:24",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3448108",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3448108",
  abstract =     "Expanding visual categorization into a novel domain
                 without the need of extra annotation has been a
                 long-term interest for multimedia intelligence.
                 Previously, this challenge has been approached by
                 unsupervised domain adaptation (UDA). Given labeled
                 data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "121",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Mawalim:2021:TIR,
  author =       "Candy Olivia Mawalim and Shogo Okada and Yukiko I.
                 Nakano",
  title =        "Task-independent Recognition of Communication Skills
                 in Group Interaction Using Time-series Modeling",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "122:1--122:27",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3450283",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3450283",
  abstract =     "Case studies of group discussions are considered an
                 effective way to assess communication skills (CS). This
                 method can help researchers evaluate participants'
                 engagement with each other in a specific realistic
                 context. In this article, multimodal analysis
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "122",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2021:WTG,
  author =       "Bo Zhang and Rui Zhang and Niccolo Bisagno and Nicola
                 Conci and Francesco G. B. {De Natale} and Hongbo Liu",
  title =        "Where Are They Going? {Predicting} Human Behaviors in
                 Crowded Scenes",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "123:1--123:19",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3449359",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3449359",
  abstract =     "In this article, we propose a framework for crowd
                 behavior prediction in complicated scenarios. The
                 fundamental framework is designed using the standard
                 encoder-decoder scheme, which is built upon the long
                 short-term memory module to capture the temporal
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "123",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Silva:2021:UMC,
  author =       "Ellen P. Silva and Nat{\'a}lia Vieira and Glauco
                 Amorim and Renata Mousinho and Gustavo Guedes and
                 Gheorghita Ghinea and Joel A. F. {Dos Santos}",
  title =        "Using Multisensory Content to Impact the Quality of
                 Experience of Reading Digital Books",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "124:1--124:18",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458676",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458676",
  abstract =     "Multisensorial books enrich a story with either
                 traditional multimedia content or sensorial effects.
                 The main idea is to increase children's interest in
                 reading by enhancing their QoE while reading. Studies
                 on enriched and/or augmented e-books also \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "124",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Jiang:2021:BDC,
  author =       "Weitao Jiang and Weixuan Wang and Haifeng Hu",
  title =        "Bi-Directional Co-Attention Network for Image
                 Captioning",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "125:1--125:20",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460474",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460474",
  abstract =     "Image Captioning, which automatically describes an
                 image with natural language, is regarded as a
                 fundamental challenge in computer vision. In recent
                 years, significant advance has been made in image
                 captioning through improving attention mechanism.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "125",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shen:2021:CDO,
  author =       "Xiangjun Shen and Jinghui Zhou and Zhongchen Ma and
                 Bingkun Bao and Zhengjun Zha",
  title =        "Cross-Domain Object Representation via Robust Low-Rank
                 Correlation Analysis",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "126:1--126:20",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458825",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458825",
  abstract =     "Cross-domain data has become very popular recently
                 since various viewpoints and different sensors tend to
                 facilitate better data representation. In this article,
                 we propose a novel cross-domain object representation
                 algorithm (RLRCA) which not only \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "126",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2021:CMH,
  author =       "Xing Xu and Yifan Wang and Yixuan He and Yang Yang and
                 Alan Hanjalic and Heng Tao Shen",
  title =        "Cross-Modal Hybrid Feature Fusion for Image-Sentence
                 Matching",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "127:1--127:23",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458281",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458281",
  abstract =     "Image-sentence matching is a challenging task in the
                 field of language and vision, which aims at measuring
                 the similarities between images and sentence
                 descriptions. Most existing methods independently map
                 the global features of images and sentences into
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "127",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Messina:2021:FGV,
  author =       "Nicola Messina and Giuseppe Amato and Andrea Esuli and
                 Fabrizio Falchi and Claudio Gennaro and St{\'e}phane
                 Marchand-Maillet",
  title =        "Fine-Grained Visual Textual Alignment for Cross-Modal
                 Retrieval Using Transformer Encoders",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "128:1--128:23",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3451390",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3451390",
  abstract =     "Despite the evolution of deep-learning-based
                 visual-textual processing systems, precise multi-modal
                 matching remains a challenging task. In this work, we
                 tackle the task of cross-modal retrieval through
                 image-sentence matching based on word-region \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "128",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ma:2021:HSP,
  author =       "Xuan Ma and Xiaoshan Yang and Junyu Gao and Changsheng
                 Xu",
  title =        "Health Status Prediction with Local-Global
                 Heterogeneous Behavior Graph",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "129:1--129:21",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3457893",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3457893",
  abstract =     "Health management is getting increasing attention all
                 over the world. However, existing health management
                 mainly relies on hospital examination and treatment,
                 which are complicated and untimely. The emergence of
                 mobile devices provides the possibility to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "129",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhai:2021:PQA,
  author =       "Guangtao Zhai and Wei Sun and Xiongkuo Min and Jiantao
                 Zhou",
  title =        "Perceptual Quality Assessment of Low-light Image
                 Enhancement",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "130:1--130:24",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3457905",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3457905",
  abstract =     "Low-light image enhancement algorithms (LIEA) can
                 light up images captured in dark or back-lighting
                 conditions. However, LIEA may introduce various
                 distortions such as structure damage, color shift, and
                 noise into the enhanced images. Despite various
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "130",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Mishra:2021:DBR,
  author =       "Prerna Mishra and Santosh Kumar and Mithilesh Kumar
                 Chaube",
  title =        "Dissimilarity-Based Regularized Learning of Charts",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "131:1--131:23",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458884",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458884",
  abstract =     "Chart images exhibit significant variabilities that
                 make each image different from others even though they
                 belong to the same class or categories. Classification
                 of charts is a major challenge because each chart class
                 has variations in features, structure, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "131",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Nandanwar:2021:NFB,
  author =       "Lokesh Nandanwar and Palaiahnakote Shivakumara and
                 Divya Krishnani and Raghavendra Ramachandra and Tong Lu
                 and Umapada Pal and Mohan Kankanhalli",
  title =        "A New Foreground-Background based Method for
                 Behavior-Oriented Social Media Image Classification",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "132:1--132:25",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458051",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458051",
  abstract =     "Due to various applications, research on personal
                 traits using information on social media has become an
                 important area. In this paper, a new method for the
                 classification of behavior-oriented social images
                 uploaded on various social media platforms is
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "132",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Alahmadi:2021:ABS,
  author =       "Mohannad Alahmadi and Peter Pocta and Hugh Melvin",
  title =        "An Adaptive Bitrate Switching Algorithm for Speech
                 Applications in Context of {WebRTC}",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "133:1--133:21",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3458751",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3458751",
  abstract =     "Web Real-Time Communication (WebRTC) combines a set of
                 standards and technologies to enable high-quality
                 audio, video, and auxiliary data exchange in web
                 browsers and mobile applications. It enables
                 peer-to-peer multimedia sessions over IP networks
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "133",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gao:2021:FVS,
  author =       "Wei Gao and Linjie Zhou and Lvfang Tao",
  title =        "A Fast View Synthesis Implementation Method for Light
                 Field Applications",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "134:1--134:20",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3459098",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3459098",
  abstract =     "View synthesis (VS) for light field images is a very
                 time-consuming task due to the great quantity of
                 involved pixels and intensive computations, which may
                 prevent it from the practical three-dimensional
                 real-time systems. In this article, we propose an
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "134",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 17(4), November 2021, article 135: Bayesian covariance
%%% representation for 3D action recognition.  The braces in the title
%%% protect "Bayesian" and the math-mode "3D" from style recasing.
@Article{Zhang:2021:BCR,
  author =       "Jianhai Zhang and Zhiyong Feng and Yong Su and Meng
                 Xing",
  title =        "{Bayesian} Covariance Representation with Global
                 Informative Prior for {$3$D} Action Recognition",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "135:1--135:22",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460235",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460235",
  abstract =     "For the merits of high-order statistics and Riemannian
                 geometry, covariance matrix has become a generic
                 feature representation for action recognition. An
                 independent action can be represented by an empirical
                 statistics over all of its pose samples. Two \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "135",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 17(4), November 2021, article 136: pedestrian-aware panoramic
%%% video stitching with a structured camera array.
@Article{Zhu:2021:PAP,
  author =       "Anqi Zhu and Lin Zhang and Juntao Chen and Yicong
                 Zhou",
  title =        "Pedestrian-Aware Panoramic Video Stitching Based on a
                 Structured Camera Array",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "136:1--136:24",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460511",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460511",
  abstract =     "The panorama stitching system is an indispensable
                 module in surveillance or space exploration. Such a
                 system enables the viewer to understand the
                 surroundings instantly by aligning the surrounding
                 images on a plane and fusing them naturally. The
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "136",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 17(4), November 2021, article 137: Y-Net, a dual-branch joint
%%% network for semantic segmentation.  The braces around "Y-Net" keep
%%% its capitalization under styles that recase titles.
@Article{Chen:2021:NDB,
  author =       "Yizhen Chen and Haifeng Hu",
  title =        "{Y-Net}: Dual-branch Joint Network for Semantic
                 Segmentation",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "137:1--137:22",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460940",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460940",
  abstract =     "Most existing segmentation networks are built upon a
                 ``U-shaped'' encoder-decoder structure, where the
                 multi-level features extracted by the encoder are
                 gradually aggregated by the decoder. Although this
                 structure has been proven to be effective in \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "137",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 17(4), November 2021, article 138: detection of non-aligned
%%% double JPEG compression.  The bibsource field names two files
%%% (datacompression.bib and tomccap.bib), separated by a semicolon.
@Article{Wang:2021:DNA,
  author =       "Jinwei Wang and Wei Huang and Xiangyang Luo and
                 Yun-Qing Shi and Sunil Kr. Jha",
  title =        "Detecting Non-Aligned Double {JPEG} Compression Based
                 on Amplitude-Angle Feature",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "138:1--138:18",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3464388",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3464388",
  abstract =     "Due to the popularity of JPEG format images in recent
                 years, JPEG images will inevitably involve image
                 editing operation. Thus, some tramped images will leave
                 tracks of Non-aligned double JPEG (NA-DJPEG)
                 compression. By detecting the presence of NA-DJPEG
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "138",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 17(4), November 2021, article 139: residual-guided in-loop
%%% filter for video coding, built on a convolutional neural network.
@Article{Jia:2021:RGL,
  author =       "Wei Jia and Li Li and Zhu Li and Xiang Zhang and Shan
                 Liu",
  title =        "Residual-guided In-loop Filter Using Convolution
                 Neural Network",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "139:1--139:19",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460820",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460820",
  abstract =     "The block-based coding structure in the hybrid video
                 coding framework inevitably introduces compression
                 artifacts such as blocking, ringing, and so on. To
                 compensate for those artifacts, extensive filtering
                 techniques were proposed in the loop of video
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "139",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 17(4), November 2021, article 140: feedback-trust-weight
%%% trust mechanism for multimedia networks.
@Article{Lv:2021:TMF,
  author =       "Zhihan Lv and Houbing Song",
  title =        "Trust Mechanism of Feedback Trust Weight in Multimedia
                 Network",
  journal =      j-TOMM,
  volume =       "17",
  number =       "4",
  pages =        "140:1--140:26",
  month =        nov,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391296",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Jan 14 07:01:30 MST 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3391296",
  abstract =     "It is necessary to solve the inaccurate data arising
                 from data reliability ignored by most data fusion
                 algorithms drawing upon collaborative filtering and
                 fuzzy network theory. Therefore, a model is constructed
                 based on the collaborative filtering \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "140",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 1: sparse LIDAR measurement
%%% fusion for fast stereo matching.
@Article{Yao:2022:SLM,
  author =       "Peng Yao and Jieqing Feng",
  title =        "Sparse {LIDAR} Measurement Fusion with Joint Updating
                 Cost for Fast Stereo Matching",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "1:1--1:18",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471870",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471870",
  abstract =     "The complementary virtues of active and passive depth
                 sensors inspire the LIDAR-Stereo fusion for enhancing
                 the accuracy of stereo matching. However, most of the
                 fusion based stereo matching algorithms have exploited
                 dense LIDAR priors with single fusion \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 2: online learning for adaptive
%%% video streaming in mobile networks (the Learn2Adapt algorithm named
%%% in the abstract).
@Article{Karagkioules:2022:OLA,
  author =       "Theodoros Karagkioules and Georgios S. Paschos and
                 Nikolaos Liakopoulos and Attilio Fiandrotti and
                 Dimitrios Tsilimantos and Marco Cagnazzo",
  title =        "Online Learning for Adaptive Video Streaming in Mobile
                 Networks",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "2:1--2:22",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460819",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460819",
  abstract =     "In this paper, we propose a novel algorithm for video
                 bitrate adaptation in HTTP Adaptive Streaming (HAS),
                 based on online learning. The proposed algorithm, named
                 Learn2Adapt (L2A), is shown to provide a robust bitrate
                 adaptation strategy which, unlike \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 3: modeling the user experience
%%% of watching 360-degree videos with head-mounted displays.
%%% The degree symbol in the title is wrapped in braces, in the same way
%%% as {$3$D} in other titles of this file, so that bibliography styles
%%% that recase titles treat the math as one protected unit.  The typeset
%%% result is unchanged.
@Article{Fan:2022:MUE,
  author =       "Ching-Ling Fan and Tse-Hou Hung and Cheng-Hsin Hsu",
  title =        "Modeling the User Experience of Watching
                 360{${}^\circ $} Videos with Head-Mounted Displays",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "3:1--3:23",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3463825",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3463825",
  abstract =     "Conducting user studies to quantify the Quality of
                 Experience (QoE) of watching the increasingly more
                 popular 360${}^\circ $ videos in Head-Mounted Displays
                 (HMDs) is time-consuming, tedious, and expensive.
                 Deriving QoE models, however, is very challenging
                 because \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 4: TTV-regularized LRTA technique
%%% for estimating haze model parameters in video dehazing.
%%% NOTE(review): in "First Last" form BibTeX takes the final token of
%%% the first author, "S.", as the family name (the citation key follows
%%% that reading).  If "P. S." are initials and "Baiju" is the name to
%%% sort under, the field should read "Baiju, P. S." -- confirm against
%%% the article before changing; the key must stay as it is.
@Article{S:2022:TRL,
  author =       "Baiju P. S. and Sudhish N. George",
  title =        "{TTV} Regularized {LRTA} Technique for the Estimation
                 of Haze Model Parameters in Video Dehazing",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "4:1--4:22",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3465454",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3465454",
  abstract =     "Nowadays, intelligent transport systems have a major
                 role in providing a safe and secure traffic society for
                 passengers, pedestrians, and vehicles. However, some
                 bad weather conditions such as haze or fog may affect
                 the visual clarity of video footage \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 5: MMSUM, a multi-view
%%% multi-modality summarization framework for sporting events.
%%% The braces around "El Saddik" make BibTeX treat the two words as a
%%% single family name.
@Article{Aloufi:2022:MDT,
  author =       "Samah Aloufi and Abdulmotaleb {El Saddik}",
  title =        "{MMSUM} Digital Twins: a Multi-view Multi-modality
                 Summarization Framework for Sporting Events",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "5:1--5:25",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462777",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462777",
  abstract =     "Sporting events generate a massive amount of traffic
                 on social media with live moment-to-moment accounts as
                 any given situation unfolds. The generated data are
                 intensified by fans feelings, reactions, and subjective
                 opinions towards what happens during \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 6: multi-feature fusion VoteNet
%%% for 3D object detection.
@Article{Wang:2022:MFF,
  author =       "Zhoutao Wang and Qian Xie and Mingqiang Wei and Kun
                 Long and Jun Wang",
  title =        "Multi-feature Fusion {VoteNet} for {$3$D} Object
                 Detection",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "6:1--6:17",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462219",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462219",
  abstract =     "In this article, we propose a Multi-feature Fusion
                 VoteNet (MFFVoteNet) framework for improving the 3D
                 object detection performance in cluttered and heavily
                 occluded scenes. Our method takes the point cloud and
                 the synchronized RGB image as inputs to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 7: multi-modal network for
%%% dynamic scene understanding.
@Article{Uddin:2022:NMM,
  author =       "Md Azher Uddin and Joolekha Bibi Joolee and Young-Koo
                 Lee and Kyung-Ah Sohn",
  title =        "A Novel Multi-Modal Network-Based Dynamic Scene
                 Understanding",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "7:1--7:19",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462218",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462218",
  abstract =     "In recent years, dynamic scene understanding has
                 gained attention from researchers because of its
                 widespread applications. The main important factor in
                 successfully understanding the dynamic scenes lies in
                 jointly representing the appearance and motion
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 8: facial-expression-aware
%%% emotional color transfer using a convolutional neural network.
@Article{Liu:2022:FEA,
  author =       "Shiguang Liu and Huixin Wang and Min Pei",
  title =        "Facial-expression-aware Emotional Color Transfer Based
                 on Convolutional Neural Network",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "8:1--8:19",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3464382",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3464382",
  abstract =     "Emotional color transfer aims to change the evoked
                 emotion of a source image to that of a target image by
                 adjusting color distribution. Most of existing
                 emotional color transfer methods only consider the
                 low-level visual features of an image and ignore
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 9: impact of AI (style transfer)
%%% on the perceived creativity of videos.
%%% NOTE(review): the second and third authors were previously entered
%%% family-name-first (Guedes De Oliveira Ines; D. E. Verboom Damion),
%%% so BibTeX took the given names as the family names.  They are
%%% reordered below to given-name-first, with the compound family name
%%% braced.  The placement of the initials "D. E." is a best reading of
%%% the publisher metadata -- confirm against the article itself.
%%% The citation key is unchanged.
@Article{PeresRebelo:2022:IAI,
  author =       "Ana Daniela {Peres Rebelo} and In{\^e}s {Guedes De
                 Oliveira} and Damion D. E. Verboom",
  title =        "The Impact of Artificial Intelligence on the
                 Creativity of Videos",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "9:1--9:27",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462634",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462634",
  abstract =     "This study explored the impact Artificial Intelligence
                 (AI) has on the evaluation of creative elements in
                 artistic videos. The aim was to verify to what extent
                 the use of an AI algorithm (Style Transfer) contributes
                 to changes in the perceived creativity \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 10: hierarchical video graph
%%% networks for one-stop video delivery.
@Article{Song:2022:LHV,
  author =       "Yaguang Song and Junyu Gao and Xiaoshan Yang and
                 Changsheng Xu",
  title =        "Learning Hierarchical Video Graph Networks for
                 One-Stop Video Delivery",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "10:1--10:23",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3466886",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3466886",
  abstract =     "The explosive growth of video data has brought great
                 challenges to video retrieval, which aims to find out
                 related videos from a video collection. Most users are
                 usually not interested in all the content of retrieved
                 videos but have a more fine-grained \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 11: mask-guided deformation
%%% adaptive network for human parsing.
@Article{Mao:2022:MGD,
  author =       "Aihua Mao and Yuan Liang and Jianbo Jiao and Yongtuo
                 Liu and Shengfeng He",
  title =        "Mask-Guided Deformation Adaptive Network for Human
                 Parsing",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "11:1--11:20",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3467889",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3467889",
  abstract =     "Due to the challenges of densely compacted body parts,
                 nonrigid clothing items, and severe overlap in crowd
                 scenes, human parsing needs to focus more on multilevel
                 feature representations compared to general scene
                 parsing tasks. Based on this observation, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 12: neural-network-based
%%% artificial observers that mimic individual media quality perception.
@Article{Tiotsop:2022:MIM,
  author =       "Lohic Fotio Tiotsop and Tomas Mizdos and Marcus
                 Barkowsky and Peter Pocta and Antonio Servetti and
                 Enrico Masala",
  title =        "Mimicking Individual Media Quality Perception with
                 Neural Network based Artificial Observers",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "12:1--12:25",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3464393",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3464393",
  abstract =     "The media quality assessment research community has
                 traditionally been focusing on developing objective
                 algorithms to predict the result of a typical
                 subjective experiment in terms of Mean Opinion Score
                 (MOS) value. However, the MOS, being a single value,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 13: diversely-supervised visual
%%% product search.
@Article{Thong:2022:DSV,
  author =       "William Thong and Cees G. M. Snoek",
  title =        "Diversely-Supervised Visual Product Search",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "13:1--13:22",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3461646",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3461646",
  abstract =     "This article strives for a diversely supervised visual
                 product search, where queries specify a diverse set of
                 labels to search for. Where previous works have focused
                 on representing attribute, instance, or category labels
                 individually, we consider them \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 14: CAPTAIN, comprehensive
%%% composition assistance for photo taking.
@Article{Farhat:2022:CCC,
  author =       "Farshid Farhat and Mohammad Mahdi Kamani and James Z.
                 Wang",
  title =        "{CAPTAIN}: Comprehensive Composition Assistance for
                 Photo Taking",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "14:1--14:24",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462762",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462762",
  abstract =     "Many people are interested in taking astonishing
                 photos and sharing them with others. Emerging high-tech
                 hardware and software facilitate the ubiquitousness and
                 functionality of digital photography. Because
                 composition matters in photography, researchers
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 15: systematic literature review
%%% of olfactory-based computing systems.
@Article{Holloman:2022:DSS,
  author =       "Amanda K. Holloman and Chris S. Crawford",
  title =        "Defining Scents: a Systematic Literature Review of
                 Olfactory-based Computing Systems",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "15:1--15:22",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470975",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470975",
  abstract =     "The human sense of smell is a primal ability that has
                 the potential to reveal unexplored relationships
                 between user behaviors and technology. Humans use
                 millions of olfactory receptor cells to observe the
                 environment around them. Olfaction studies are
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 16: hyperspectral image
%%% reconstruction using multi-scale fusion learning.
@Article{Han:2022:HIR,
  author =       "Xian-Hua Han and Yinqiang Zheng and Yen-Wei Chen",
  title =        "Hyperspectral Image Reconstruction Using Multi-scale
                 Fusion Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "16:1--16:21",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477396",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477396",
  abstract =     "Hyperspectral imaging is a promising imaging modality
                 that simultaneously captures several images for the
                 same scene on narrow spectral bands, and it has made
                 considerable progress in different fields, such as
                 agriculture, astronomy, and surveillance. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 17: empirical method for causal
%%% inference of QoE constructs in haptic-audiovisual communications.
@Article{Tasaka:2022:EMC,
  author =       "Shuji Tasaka",
  title =        "An Empirical Method for Causal Inference of Constructs
                 for {QoE} in Haptic-Audiovisual Communications",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "17:1--17:24",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473986",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473986",
  abstract =     "This article proposes an empirical method for
                 inferring causal directions in multidimensional Quality
                 of Experience (QoE) in multimedia communications,
                 noting that causation in QoE is perceptual. As an
                 example for modeling framework, we pick up a Bayesian
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

%%% TOMM 18(1), January 2022, article 18: RD-IOD, a two-level
%%% residual-distillation-based triple-network for incremental object
%%% detection.
@Article{Yang:2022:RIT,
  author =       "Dongbao Yang and Yu Zhou and Wei Shi and Dayan Wu and
                 Weiping Wang",
  title =        "{RD-IOD}: Two-Level Residual-Distillation-Based
                 Triple-Network for Incremental Object Detection",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "18:1--18:23",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472393",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472393",
  abstract =     "As a basic component in multimedia applications,
                 object detectors are generally trained on a fixed set
                 of classes that are pre-defined. However, new object
                 classes often emerge after the models are trained in
                 practice. Modern object detectors based on \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hsu:2022:OIV,
  author =       "Chih-Fan Hsu and Tse-Hou Hung and Cheng-Hsin Hsu",
  title =        "Optimizing Immersive Video Coding Configurations Using
                 Deep Learning: a Case Study on {TMIV}",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "19:1--19:25",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471191",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471191",
  abstract =     "Immersive video streaming technologies improve Virtual
                 Reality (VR) user experience by providing users more
                 intuitive ways to move in simulated worlds, e.g., with
                 6 Degree-of-Freedom (6DoF) interaction mode. A naive
                 method to achieve 6DoF is deploying \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Siegfried:2022:RUG,
  author =       "R{\'e}my Siegfried and Jean-Marc Odobez",
  title =        "Robust Unsupervised Gaze Calibration Using
                 Conversation and Manipulation Attention Priors",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "20:1--20:27",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472622",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472622",
  abstract =     "Gaze estimation is a difficult task, even for humans.
                 However, as humans, we are good at understanding a
                 situation and exploiting it to guess the expected
                 visual focus of attention of people, and we usually use
                 this information to retrieve people's gaze. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2022:LLS,
  author =       "Jing Wang and Weiqing Min and Sujuan Hou and Shengnan
                 Ma and Yuanjie Zheng and Shuqiang Jiang",
  title =        "{LogoDet-3K}: a Large-scale Image Dataset for Logo
                 Detection",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "21:1--21:19",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3466780",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3466780",
  abstract =     "Logo detection has been gaining considerable attention
                 because of its wide range of applications in the
                 multimedia field, such as copyright infringement
                 detection, brand visibility monitoring, and product
                 brand management on social media. In this article,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2022:ALC,
  author =       "Da-Chun Wu and Yu-Tsung Hsu",
  title =        "Authentication of {LINE} Chat History Files by
                 Information Hiding",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "22:1--22:23",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3474225",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3474225",
  abstract =     "With the prevalence of smartphones, message exchanges
                 via mobile chatting programs like LINE have become
                 popular. The messages in the form of chat records in a
                 LINE chat history, after being downloaded for legal
                 uses, might be tampered with illicitly. A \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2022:PPM,
  author =       "Changming Liu and Xiaojing Ma and Sixing Cao and
                 Jiayun Fu and Bin B. Zhu",
  title =        "Privacy-preserving Motion Detection for
                 {HEVC}-compressed Surveillance Video",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1",
  pages =        "23:1--23:27",
  month =        jan,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472669",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:22:44 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472669",
  abstract =     "In the cloud era, a large amount of data is uploaded
                 to and processed by public clouds. The risk of privacy
                 leakage has become a major concern for cloud users.
                 Cloud-based video surveillance requires motion
                 detection, which may reveal the privacy of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2022:ISIa,
  author =       "Shiliang Zhang and Guorong Li and Weigang Zhang and
                 Qingming Huang and Tiejun Huang and Mubarak Shah and
                 Nicu Sebe",
  title =        "Introduction to the Special Issue on Fine-Grained
                 Visual Recognition and Re-Identification",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "24:1--24:3",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3505280",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3505280",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2022:HMM,
  author =       "La Zhang and Haiyun Guo and Kuan Zhu and Honglin Qiao
                 and Gaopan Huang and Sen Zhang and Huichen Zhang and
                 Jian Sun and Jinqiao Wang",
  title =        "Hybrid Modality Metric Learning for Visible-Infrared
                 Person Re-Identification",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "25:1--25:15",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473341",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473341",
  abstract =     "Visible-infrared person re-identification (Re-ID) has
                 received increasing research attention for its great
                 practical value in night-time surveillance scenarios.
                 Due to the large variations in person pose, viewpoint,
                 and occlusion in the same modality, as \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2022:BIB,
  author =       "Sheng Xu and Chang Liu and Baochang Zhang and Jinhu
                 L{\"u} and Guodong Guo and David Doermann",
  title =        "{BiRe-ID}: Binary Neural Network for Efficient Person
                 Re-{ID}",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "26:1--26:22",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473340",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473340",
  abstract =     "Person re-identification (Re-ID) has been promoted by
                 the significant success of convolutional neural
                 networks (CNNs). However, the application of such
                 CNN-based Re-ID methods depends on the tremendous
                 consumption of computation and memory resources,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2022:JGF,
  author =       "Zhongwei Zhao and Ran Song and Qian Zhang and Peng
                 Duan and Youmei Zhang",
  title =        "{JoT-GAN}: a Framework for Jointly Training {GAN} and
                 Person Re-Identification Model",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "27:1--27:18",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491225",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491225",
  abstract =     "To cope with the problem caused by inadequate training
                 data, many person re-identification (re-id) methods
                 exploit generative adversarial networks (GAN) for data
                 augmentation, where the training of GAN is typically
                 independent of that of the re-id model. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liang:2022:SCP,
  author =       "Liqian Liang and Congyan Lang and Zun Li and Jian Zhao
                 and Tao Wang and Songhe Feng",
  title =        "Seeing Crucial Parts: Vehicle Model Verification via a
                 Discriminative Representation Model",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "28:1--28:22",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3474596",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3474596",
  abstract =     "Widely used surveillance cameras have promoted large
                 amounts of street scene data, which contains one
                 important but long-neglected object: the vehicle. Here
                 we focus on the challenging problem of vehicle model
                 verification. Most previous works usually \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yan:2022:AIF,
  author =       "Chenggang Yan and Lixuan Meng and Liang Li and Jiehua
                 Zhang and Zhan Wang and Jian Yin and Jiyong Zhang and
                 Yaoqi Sun and Bolun Zheng",
  title =        "Age-Invariant Face Recognition by Multi-Feature
                 Fusion and Decomposition with Self-attention",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "29:1--29:18",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472810",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472810",
  abstract =     "Different from general face recognition, age-invariant
                 face recognition (AIFR) aims at matching faces with a
                 big age gap. Previous discriminative methods usually
                 focus on decomposing facial feature into age-related
                 and age-invariant components, which \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhai:2022:RML,
  author =       "Deming Zhai and Ruifeng Shi and Junjun Jiang and
                 Xianming Liu",
  title =        "Rectified Meta-learning from Noisy Labels for Robust
                 Image-based Plant Disease Classification",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "30:1--30:17",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472809",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472809",
  abstract =     "Plant diseases serve as one of main threats to food
                 security and crop production. It is thus valuable to
                 exploit recent advances of artificial intelligence to
                 assist plant disease diagnosis. One popular approach is
                 to transform this problem as a leaf \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tan:2022:FGI,
  author =       "Min Tan and Fu Yuan and Jun Yu and Guijun Wang and
                 Xiaoling Gu",
  title =        "Fine-grained Image Classification via Multi-scale
                 Selective Hierarchical Biquadratic Pooling",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "31:1--31:23",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3492221",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3492221",
  abstract =     "How to extract distinctive features greatly challenges
                 the fine-grained image classification tasks. In
                 previous models, bilinear pooling has been frequently
                 adopted to address this problem. However, most bilinear
                 pooling models neglect either intra or \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cucchiara:2022:FGH,
  author =       "Rita Cucchiara and Matteo Fabbri",
  title =        "Fine-grained Human Analysis under Occlusions and
                 Perspective Constraints in Multimedia Surveillance",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "32:1--32:23",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3476839",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3476839",
  abstract =     "Human detection in the wild is a research topic of
                 paramount importance in computer vision, and it is the
                 starting step for designing intelligent systems
                 oriented to human interaction that work in complete
                 autonomy. To achieve this goal, computer vision
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2022:ICG,
  author =       "Lei Wu and Hefei Ling and Yuxuan Shi and Baiyan
                 Zhang",
  title =        "Instance Correlation Graph for Unsupervised Domain
                 Adaptation",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "33:1--33:23",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3486251",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3486251",
  abstract =     "In recent years, deep neural networks have emerged as
                 a dominant machine learning tool for a wide variety of
                 application fields. Due to the expensive cost of manual
                 labeling efforts, it is important to transfer knowledge
                 from a label-rich source domain to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Mugnai:2022:FGA,
  author =       "Daniele Mugnai and Federico Pernici and Francesco
                 Turchini and Alberto {Del Bimbo}",
  title =        "Fine-Grained Adversarial Semi-Supervised Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "34:1--34:19",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485473",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485473",
  abstract =     "In this article, we exploit Semi-Supervised Learning
                 (SSL) to increase the amount of training data to
                 improve the performance of Fine-Grained Visual
                 Categorization (FGVC). This problem has not been
                 investigated in the past in spite of prohibitive
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Luo:2022:ERU,
  author =       "Dezhao Luo and Yu Zhou and Bo Fang and Yucan Zhou and
                 Dayan Wu and Weiping Wang",
  title =        "Exploring Relations in Untrimmed Videos for
                 Self-Supervised Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "35:1--35:21",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473342",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473342",
  abstract =     "Existing video self-supervised learning methods mainly
                 rely on trimmed videos for model training. They apply
                 their methods and verify the effectiveness on trimmed
                 video datasets including UCF101 and Kinetics-400, among
                 others. However, trimmed datasets \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2022:EEC,
  author =       "Yabin Wang and Zhiheng Ma and Xing Wei and Shuai Zheng
                 and Yaowei Wang and Xiaopeng Hong",
  title =        "{ECCNAS}: Efficient Crowd Counting Neural Architecture
                 Search",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "36:1--36:19",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3465455",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3465455",
  abstract =     "Recent solutions to crowd counting problems have
                 already achieved promising performance across various
                 benchmarks. However, applying these approaches to
                 real-world applications is still challenging, because
                 they are computation intensive and lack the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2022:CFH,
  author =       "Wenxu Li and Gang Pan and Chen Wang and Zhen Xing and
                 Zhenjun Han",
  title =        "From Coarse to Fine: Hierarchical Structure-aware
                 Video Summarization",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "37:1--37:16",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485472",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485472",
  abstract =     "Hierarchical structure is a common characteristic for
                 some kinds of videos (e.g., sports videos, game
                 videos): The videos are composed of several actions
                 hierarchically and there exist temporal dependencies
                 among segments with different scales, where \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hossain:2022:SSA,
  author =       "M. Shamim Hossain and Rita Cucchiara and Ghulam
                 Muhammad and Diana P. Tob{\'o}n and Abdulmotaleb {El
                 Saddik}",
  title =        "Special Section on {AI-empowered} Multimedia Data
                 Analytics for Smart Healthcare",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "38:1--38:2",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3505281",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3505281",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2022:MFT,
  author =       "Min Chen and Wenjing Xiao and Miao Li and Yixue Hao
                 and Long Hu and Guangming Tao",
  title =        "A Multi-feature and Time-aware-based Stress Evaluation
                 Mechanism for Mental Status Adjustment",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "39:1--39:18",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3462763",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3462763",
  abstract =     "With the rapid economic development, the prominent
                 social competition has led to increasing psychological
                 pressure of people felt from each aspect of life.
                 Driven by the Internet of Things and artificial
                 intelligence, intelligent psychological pressure
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Masud:2022:CNN,
  author =       "Mehedi Masud and Mohammed F. Alhamid and Yin Zhang",
  title =        "A Convolutional Neural Network Model Using Weighted
                 Loss Function to Detect Diabetic Retinopathy",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "40:1--40:16",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470976",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470976",
  abstract =     "Nowadays, artificial intelligence (AI) provides
                 tremendous prospects for driving future healthcare
                 while empowering patients and service providers. The
                 extensive use of digital healthcare produces a massive
                 amount of multimedia healthcare data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2022:TTM,
  author =       "Debin Liu and Laurence T. Yang and Puming Wang and
                 Ruonan Zhao and Qingchen Zhang",
  title =        "{TT-TSVD}: a Multi-modal Tensor Train Decomposition
                 with Its Application in Convolutional Neural Networks
                 for Smart Healthcare",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "41:1--41:17",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491223",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491223",
  abstract =     "Smart healthcare systems are generating a large scale
                 of heterogeneous high-dimensional data with complex
                 relationships. It is hard for current methods to
                 analyze such high-dimensional healthcare data.
                 Specifically, the traditional data reduction methods
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2022:MNM,
  author =       "Chun-Wei Yang and Thanh Hai Phung and Hong-Han Shuai
                 and Wen-Huang Cheng",
  title =        "Mask or Non-Mask? {Robust} Face Mask Detector via
                 Triplet-Consistency Representation Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "42:1--42:20",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472623",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472623",
  abstract =     "In the absence of vaccines or medicines to stop
                 COVID-19, one of the effective methods to slow the
                 spread of the coronavirus and reduce the overloading of
                 healthcare is to wear a face mask. Nevertheless, to
                 mandate the use of face masks or coverings in
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lv:2022:DLB,
  author =       "Zhihan Lv and Zengchen Yu and Shuxuan Xie and Atif
                 Alamri",
  title =        "Deep Learning-based Smart Predictive Evaluation for
                 Interactive Multimedia-enabled Smart Healthcare",
  journal =      j-TOMM,
  volume =       "18",
  number =       "1s",
  pages =        "43:1--43:20",
  month =        feb,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3468506",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468506",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
  remark =       "The abstract formerly recorded in this entry
                 (beginning ``Two-dimensional arrays of bi-component
                 structures made of cobalt and permalloy elliptical
                 dots'') describes a magnetics experiment using
                 Brillouin light scattering and is unrelated to this
                 article's title and subject. It has been removed
                 until the correct abstract can be verified against
                 the published article.",
}

@Article{Amirpour:2022:ELF,
  author =       "Hadi Amirpour and Antonio Pinheiro and Manuela Pereira
                 and Fernando J. P. Lopes and Mohammad Ghanbari",
  title =        "Efficient Light Field Image Compression with Enhanced
                 Random Access",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "44:1--44:18",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471905",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471905",
  abstract =     "In light field image compression, facilitating random
                 access to individual views plays a significant role in
                 decoding views quickly, reducing memory footprint, and
                 decreasing the bandwidth requirement for transmission.
                 Highly efficient light field image \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Morillo:2022:EIP,
  author =       "Pedro Morillo and Jos{\'e} J. Navarro-P{\'e}rez and
                 Juan M. Ordu{\~n}a and Marcos Fern{\'a}ndez",
  title =        "Evaluation of an Intervention Program Based on Mobile
                 Apps to Learn Sexism Prevention in Teenagers",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "45:1--45:20",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3471139",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3471139",
  abstract =     "The fight against sexism is nowadays one of the
                 flagship social movements in western countries.
                 Adolescence is a crucial period, and some empirical
                 studies have focused on the socialization of teenagers,
                 proving that the socialization with the surrounding
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tang:2022:LTS,
  author =       "Yansong Tang and Xingyu Liu and Xumin Yu and Danyang
                 Zhang and Jiwen Lu and Jie Zhou",
  title =        "Learning from Temporal Spatial Cubism for
                 Cross-Dataset Skeleton-based Action Recognition",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "46:1--46:24",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3472722",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3472722",
  abstract =     "Rapid progress and superior performance have been
                 achieved for skeleton-based action recognition
                 recently. In this article, we investigate this problem
                 under a cross-dataset setting, which is a new,
                 pragmatic, and challenging task in real-world
                 scenarios. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kizilkaya:2022:EFF,
  author =       "Burak Kizilkaya and Enver Ever and Hakan Yekta Yatbaz
                 and Adnan Yazici",
  title =        "An Effective Forest Fire Detection Framework Using
                 Heterogeneous Wireless Multimedia Sensor Networks",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "47:1--47:21",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473037",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473037",
  abstract =     "With improvements in the area of Internet of Things
                 (IoT), surveillance systems have recently become more
                 accessible. At the same time, optimizing the energy
                 requirements of smart sensors, especially for data
                 transmission, has always been very important \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2022:UEU,
  author =       "Yehao Li and Jiahao Fan and Yingwei Pan and Ting Yao
                 and Weiyao Lin and Tao Mei",
  title =        "{Uni-EDEN}: Universal Encoder-Decoder Network by
                 Multi-Granular Vision-Language Pre-training",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "48:1--48:16",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3473140",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3473140",
  abstract =     "Vision-language pre-training has been an emerging and
                 fast-developing research topic, which transfers
                 multi-modal knowledge from rich-resource pre-training
                 task to limited-resource downstream tasks. Unlike
                 existing works that predominantly learn a single
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Feng:2022:CSL,
  author =       "Shenming Feng and Xingzhong Nong and Haifeng Hu",
  title =        "Cascaded Structure-Learning Network with Using
                 Adversarial Training for Robust Facial Landmark
                 Detection",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "49:1--49:20",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3474595",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3474595",
  abstract =     "Recently, great progress has been achieved on facial
                 landmark detection based on convolutional neural
                 network, while it is still challenging due to partial
                 occlusion and extreme head pose. In this paper, we
                 propose a Cascaded Structure-Learning Network
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Damme:2022:MLB,
  author =       "Sam {Van Damme} and Maria {Torres Vega} and Filip {De
                 Turck}",
  title =        "Machine Learning Based Content-Agnostic Viewport
                 Prediction for 360-Degree Video",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "50:1--50:24",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3474833",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3474833",
  abstract =     "Accurate and fast estimations or predictions of the
                 (near) future location of the users of head-mounted
                 devices within the virtual omnidirectional environment
                 open a plethora of opportunities in application domains
                 such as interactive immersive gaming and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yeh:2022:GVW,
  author =       "Chih-Kuo Yeh and Thi-Ngoc-Hanh Le and Zhi-Ying Hou and
                 Tong-Yee Lee",
  title =        "Generating Virtual Wire Sculptural Art from {$3$D}
                 Models",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "51:1--51:23",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3475798",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3475798",
  abstract =     "Wire sculptures are objects sculpted by the use of
                 wires. In this article, we propose practical methods to
                 create 3D virtual wire sculptural art from a given 3D
                 model. In contrast, most of the previous 3D wire art
                 results are reconstructed from input 2D \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sun:2022:RGJ,
  author =       "Teng Sun and Chun Wang and Xuemeng Song and Fuli Feng
                 and Liqiang Nie",
  title =        "Response Generation by Jointly Modeling Personalized
                 Linguistic Styles and Emotions",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "52:1--52:20",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3475872",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3475872",
  abstract =     "Natural language generation (NLG) has been an
                 essential technique for various applications, like
                 XiaoIce and Siri, and engaged increasing attention
                 recently. To improve the user experience, several
                 emotion-aware NLG methods have been developed to
                 generate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Francis:2022:GRS,
  author =       "Jobin Francis and M. Baburaj and Sudhish N. George",
  title =        "An {$ l_{1 / 2} $} and Graph Regularized Subspace
                 Clustering Method for Robust Image Segmentation",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "53:1--53:24",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3476514",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3476514",
  abstract =     "Segmenting meaningful visual structures from an image
                 is a fundamental and most-addressed problem in image
                 analysis algorithms. However, among factors such as
                 diverse visual patterns, noise, complex backgrounds,
                 and similar textures present in foreground \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2022:WYE,
  author =       "Jiahao Wang and Yunhong Wang and Nina Weng and Tianrui
                 Chai and Annan Li and Faxi Zhang and Samsi Yu",
  title =        "Will You Ever Become Popular? {Learning} to Predict
                 Virality of Dance Clips",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "54:1--54:24",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477533",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477533",
  abstract =     "Dance challenges are going viral in video communities
                 like TikTok nowadays. Once a challenge becomes popular,
                 thousands of short-form videos will be uploaded within
                 a couple of days. Therefore, virality prediction from
                 dance challenges is of great \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhong:2022:DSA,
  author =       "Sheng-Hua Zhong and Jingxu Lin and Jianglin Lu and
                 Ahmed Fares and Tongwei Ren",
  title =        "Deep Semantic and Attentive Network for Unsupervised
                 Video Summarization",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "55:1--55:21",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477538",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477538",
  abstract =     "With the rapid growth of video data, video
                 summarization is a promising approach to shorten a
                 lengthy video into a compact version. Although
                 supervised summarization approaches have achieved
                 state-of-the-art performance, they require frame-level
                 annotated \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zeng:2022:MIL,
  author =       "Yawen Zeng and Da Cao and Shaofei Lu and Hanling Zhang
                 and Jiao Xu and Zheng Qin",
  title =        "Moment is Important: Language-Based Video Moment
                 Retrieval via Adversarial Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "56:1--56:21",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3478025",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3478025",
  abstract =     "The newly emerging language-based video moment
                 retrieval task aims at retrieving a target video moment
                 from an untrimmed video given a natural language as the
                 query. It is more applicable in reality since it is
                 able to accurately localize a specific video \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2022:LTP,
  author =       "Hanjie Wu and Yongtuo Liu and Hongmin Cai and
                 Shengfeng He",
  title =        "Learning Transferable Perturbations for Image
                 Captioning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "57:1--57:18",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3478024",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3478024",
  abstract =     "Present studies have discovered that state-of-the-art
                 deep learning models can be attacked by small but
                 well-designed perturbations. Existing attack algorithms
                 for the image captioning task is time-consuming, and
                 their generated adversarial examples \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sun:2022:SSS,
  author =       "Ziyi Sun and Yunfeng Zhang and Fangxun Bao and Ping
                 Wang and Xunxiang Yao and Caiming Zhang",
  title =        "{SADnet}: Semi-supervised Single Image Dehazing Method
                 Based on an Attention Mechanism",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "58:1--58:23",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3478457",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3478457",
  abstract =     "Many real-life tasks such as military reconnaissance
                 and traffic monitoring require high-quality images.
                 However, images acquired in foggy or hazy weather pose
                 obstacles to the implementation of these real-life
                 tasks; consequently, image dehazing is an \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2022:TIS,
  author =       "Feifei Zhang and Mingliang Xu and Changsheng Xu",
  title =        "Tell, Imagine, and Search: End-to-end Learning for
                 Composing Text and Image to Image Retrieval",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "59:1--59:23",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3478642",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3478642",
  abstract =     "Composing Text and Image to Image Retrieval (CTI-IR)
                 is an emerging task in computer vision, which allows
                 retrieving images relevant to a query image with text
                 describing desired modifications to the query image.
                 Most conventional cross-modal retrieval \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ma:2022:SAM,
  author =       "Haoyu Ma and Bingchen Gong and Yizhou Yu",
  title =        "Structure-aware Meta-fusion for Image
                 Super-resolution",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "60:1--60:25",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477553",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477553",
  abstract =     "There are two main categories of image
                 super-resolution algorithms: distortion oriented and
                 perception oriented. Recent evidence shows that
                 reconstruction accuracy and perceptual quality are
                 typically in disagreement with each other. In this
                 article, we \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tahir:2022:NAT,
  author =       "Madiha Tahir and Zahid Halim and Atta Ur Rahman and
                 Muhammad Waqas and Shanshan Tu and Sheng Chen and Zhu
                 Han",
  title =        "Non-Acted Text and Keystrokes Database and Learning
                 Methods to Recognize Emotions",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "61:1--61:24",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3480968",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3480968",
  abstract =     "The modern computing applications are presently
                 adapting to the convenient availability of huge and
                 diverse data for making their pattern recognition
                 methods smarter. Identification of dominant emotion
                 solely based on the text data generated by humans is
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Fincato:2022:TWD,
  author =       "Matteo Fincato and Marcella Cornia and Federico Landi
                 and Fabio Cesari and Rita Cucchiara",
  title =        "Transform, Warp, and Dress: a New
                 Transformation-guided Model for Virtual Try-on",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "62:1--62:24",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491226",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491226",
  abstract =     "Virtual try-on has recently emerged in computer vision
                 and multimedia communities with the development of
                 architectures that can generate realistic images of a
                 target person wearing a custom garment. This research
                 interest is motivated by the large role \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Han:2022:AMG,
  author =       "Ning Han and Jingjing Chen and Hao Zhang and Huanwen
                 Wang and Hao Chen",
  title =        "Adversarial Multi-Grained Embedding Network for
                 Cross-Modal Text-Video Retrieval",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "63:1--63:23",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3483381",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3483381",
  abstract =     "Cross-modal retrieval between texts and videos has
                 received consistent research interest in the multimedia
                 community. Existing studies follow a trend of learning
                 a joint embedding space to measure the distance between
                 text and video representations. In \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "63",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Pang:2022:FUP,
  author =       "Bo Pang and Deming Zhai and Junjun Jiang and Xianming
                 Liu",
  title =        "Fully Unsupervised Person Re-Identification via
                 Selective Contrastive Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "64:1--64:15",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485061",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485061",
  abstract =     "Person re-identification (ReID) aims at searching the
                 same identity person among images captured by various
                 cameras. Existing fully supervised person ReID methods
                 usually suffer from poor generalization capability
                 caused by domain gaps. Unsupervised \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhuang:2022:MAD,
  author =       "Wenlin Zhuang and Congyi Wang and Jinxiang Chai and
                 Yangang Wang and Ming Shao and Siyu Xia",
  title =        "{Music2Dance}: {DanceNet} for Music-Driven Dance
                 Generation",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2",
  pages =        "65:1--65:21",
  month =        may,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485664",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:54 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485664",
  abstract =     "Synthesize human motions from music (i.e., music to
                 dance) is appealing and has attracted lots of research
                 interests in recent years. It is challenging because of
                 the requirement for realistic and complex human motions
                 for dance, but more importantly, the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "65",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cetinic:2022:UCA,
  author          = {Eva Cetinic and James She},
  title           = {Understanding and Creating Art with {AI}: Review and Outlook},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2},
  articleno       = {66},
  pages           = {66:1--66:22},
  month           = may,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3475799},
  URL             = {https://dl.acm.org/doi/10.1145/3475799},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Technologies related to artificial intelligence (AI) have a strong impact on the changes of research and creative practices in visual arts. The growing number of research initiatives and creative applications that emerge in the intersection of AI and art \ldots{}},
  bibdate         = {Thu Mar 24 08:21:54 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Zhang:2022:DVS,
  author          = {Zheng Zhang and Jianning Wang and Lei Zhu and Guangming Lu},
  title           = {Discriminative Visual Similarity Search with Semantically Cycle-consistent Hashing Networks},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {114},
  pages           = {114:1--114:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3532519},
  URL             = {https://dl.acm.org/doi/10.1145/3532519},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Deep hashing has great potential in large-scale visual similarity search due to its preferable efficiency in storage and computation. Technically, deep \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Ge:2022:DVD,
  author          = {Shiming Ge and Fanzhao Lin and Chenyu Li and Daichi Zhang and Weiping Wang and Dan Zeng},
  title           = {Deepfake Video Detection via Predictive Representation Learning},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {115},
  pages           = {115:1--115:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3536426},
  URL             = {https://dl.acm.org/doi/10.1145/3536426},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Increasingly advanced deepfake approaches have made the detection of deepfake videos very challenging. We observe that the general deepfake videos often \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Galteri:2022:LLB,
  author          = {Leonardo Galteri and Lorenzo Seidenari and Pietro Bongini and Marco Bertini and Alberto {Del Bimbo}},
  title           = {{LANBIQUE}: {LANguage-based Blind Image QUality Evaluation}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {116},
  pages           = {116:1--116:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3538649},
  URL             = {https://dl.acm.org/doi/10.1145/3538649},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Image quality assessment is often performed with deep networks that are fine-tuned to regress a human provided quality score of a given image. Usually, this \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Lv:2022:SCC,
  author          = {Zhihan Lv and Dongliang Chen and Haibin Lv},
  title           = {Smart City Construction and Management by Digital Twins and {BIM} Big Data in {COVID-19} Scenario},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {117},
  pages           = {117:1--117:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3529395},
  URL             = {https://dl.acm.org/doi/10.1145/3529395},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {With the rapid development of information technology and the spread of Corona Virus Disease 2019 (COVID-19), the government and urban managers are looking \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Anand:2022:CSD,
  author          = {Ashima Anand and Amit Kumar Singh},
  title           = {A Comprehensive Study of Deep Learning-based Covert Communication},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {118},
  pages           = {118:1--118:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3508365},
  URL             = {https://dl.acm.org/doi/10.1145/3508365},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Deep learning-based methods have been popular in multimedia analysis tasks, including classification, detection, segmentation, and so on. In \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Xu:2022:EAC,
  author          = {Haotian Xu and Xiaobo Jin and Qiufeng Wang and Amir Hussain and Kaizhu Huang},
  title           = {Exploiting Attention-Consistency Loss For Spatial-Temporal Stream Action Recognition},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {119},
  pages           = {119:1--119:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3538749},
  URL             = {https://dl.acm.org/doi/10.1145/3538749},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Currently, many action recognition methods mostly consider the information from spatial streams. We propose a new perspective inspired by the human visual \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Salim:2022:PED,
  author          = {Sara Salim and Nour Moustafa and Benjamin Turnbull and Imran Razzak},
  title           = {Perturbation-enabled Deep Federated Learning for Preserving {Internet of Things}-based Social Networks},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {120},
  pages           = {120:1--120:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3537899},
  URL             = {https://dl.acm.org/doi/10.1145/3537899},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Federated Learning (FL), as an emerging form of distributed machine learning (ML), can protect participants' private data from being substantially disclosed to \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Bi:2022:DTE,
  author          = {An-Qi Bi and Xiao-Yang Tian and Shui-Hua Wang and Yu-Dong Zhang},
  title           = {Dynamic Transfer Exemplar based Facial Emotion Recognition Model Toward Online Video},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {121},
  pages           = {121:1--121:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3538385},
  URL             = {https://dl.acm.org/doi/10.1145/3538385},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {In this article, we focus on the dynamic facial emotion recognition from online video. We combine deep neural networks with transfer learning theory and propose a \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Golmaryami:2022:SSS,
  author          = {Marjan Golmaryami and Rahim Taheri and Zahra Pooranian and Mohammad Shojafar and Pei Xiao},
  title           = {{SETTI}: a {Self-supervised AdvErsarial Malware DeTection ArchiTecture in an IoT} Environment},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {122},
  pages           = {122:1--122:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3536425},
  URL             = {https://dl.acm.org/doi/10.1145/3536425},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {In recent years, malware detection has become an active research topic in the area of Internet of Things (IoT) security. The principle is to exploit knowledge \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Khan:2022:PPM,
  author          = {Abbas Khan and Ijaz {Ul Haq} and Tanveer Hussain and Khan Muhammad and Mohammad Hijji and Muhammad Sajjad and Victor Hugo C. {De Albuquerque} and Sung Wook Baik},
  title           = {{PMAL}: a Proxy Model Active Learning Approach for Vision Based Industrial Applications},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {123},
  pages           = {123:1--123:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3534932},
  URL             = {https://dl.acm.org/doi/10.1145/3534932},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Deep Learning models' performance strongly correlate with availability of annotated data; however, massive data labeling is laborious, expensive, and error-prone when \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Yang:2022:DQN,
  author          = {Chenyi Yang and Xiaolong Xu and Xiaokang Zhou and Lianyong Qi},
  title           = {{Deep Q} Network-Driven Task Offloading for Efficient Multimedia Data Analysis in Edge Computing-Assisted {IoV}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {124},
  pages           = {124:1--124:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3548687},
  URL             = {https://dl.acm.org/doi/10.1145/3548687},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {With the prosperity of Industry 4.0, numerous emerging industries continue to gain popularity and their market scales are expanding ceaselessly. The \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Tiwari:2022:ODN,
  author          = {Arti Tiwari and Millie Pant},
  title           = {Optimized Deep-Neural Network for Content-based Medical Image Retrieval in a Brownfield {IoMT} Network},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {125},
  pages           = {125:1--125:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3546194},
  URL             = {https://dl.acm.org/doi/10.1145/3546194},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {In this paper, a brownfield Internet of Medical Things network is introduced for imaging data that can be easily scaled out depending on the objectives, \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Huang:2022:SFM,
  author          = {Wei Huang and Yuze Zhang and Shaohua Wan},
  title           = {A Sorting Fuzzy Min-Max Model in an Embedded System for Atrial Fibrillation Detection},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {126},
  pages           = {126:1--126:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3554737},
  URL             = {https://dl.acm.org/doi/10.1145/3554737},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Atrial fibrillation detection (AFD) has attracted much attention in the field of embedded systems. In this study, we propose a sorting fuzzy min-max \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Yang:2022:ISS,
  author =       "Xun Yang and Liang Zheng and Elisa Ricci and Meng
                 Wang",
  title =        "Introduction to the Special Section on Learning
                 Representations, Similarity, and Associations in
                 Dynamic Multimedia Environments",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2s",
  pages =        "127e:1--127e:??",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569952",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569952",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "127e",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{He:2022:RLD,
  author          = {Jun He and Richang Hong and Xueliang Liu and Mingliang Xu and Qianru Sun},
  title           = {Revisiting Local Descriptor for Improved Few-Shot Classification},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {127},
  pages           = {127:1--127:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3511917},
  URL             = {https://dl.acm.org/doi/10.1145/3511917},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Few-shot classification studies the problem of quickly adapting a deep learner to understanding novel classes based on few support images. In this context, \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Jiao:2022:GGL,
  author          = {Yingying Jiao and Haipeng Chen and Runyang Feng and Haoming Chen and Sifan Wu and Yifang Yin and Zhenguang Liu},
  title           = {{GLPose}: Global-Local Representation Learning for Human Pose Estimation},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {128},
  pages           = {128:1--128:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3519305},
  URL             = {https://dl.acm.org/doi/10.1145/3519305},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Multi-frame human pose estimation is at the core of many computer vision tasks. Although state-of-the-art approaches have demonstrated remarkable results \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Han:2022:STS,
  author          = {Qing Han and Huiting Liu and Weidong Min and Tiemei Huang and Deyu Lin and Qi Wang},
  title           = {{$3$D} Skeleton and Two Streams Approach to Person Re-identification Using Optimized Region Matching},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {129},
  pages           = {129:1--129:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3538490},
  URL             = {https://dl.acm.org/doi/10.1145/3538490},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Person re-identification (Re-ID) is a challenging and arduous task due to non-overlapping views, complex background, and uncontrollable occlusion in video \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Xu:2022:RRL,
  author          = {Xin Xu and Xin Yuan and Zheng Wang and Kai Zhang and Ruimin Hu},
  title           = {Rank-in-Rank Loss for Person Re-identification},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {130},
  pages           = {130:1--130:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3532866},
  URL             = {https://dl.acm.org/doi/10.1145/3532866},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Person re-identification (re-ID) is commonly investigated as a ranking problem. However, the performance of existing re-ID models drops dramatically, when they \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Li:2022:GGA,
  author          = {Kunpeng Li and Chang Liu and Mike Stopa and Jun Amano and Yun Fu},
  title           = {Guided Graph Attention Learning for Video-Text Matching},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {2s},
  articleno       = {131},
  pages           = {131:1--131:??},
  month           = jun,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3538533},
  URL             = {https://dl.acm.org/doi/10.1145/3538533},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {As a bridge between videos and natural languages, video-text matching has been a hot multimedia research topic in recent years. Such cross-modal retrieval \ldots{}},
  bibdate         = {Thu Jun 22 10:29:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Biondi:2022:CRC,
  author =       "Niccol{\'o} Biondi and Federico Pernici and Matteo
                 Bruni and Daniele Mugnai and Alberto {Del Bimbo}",
  title =        "{CL$^2$R}: Compatible Lifelong Learning
                 Representations",
  journal =      j-TOMM,
  volume =       "18",
  number =       "2s",
  pages =        "132:1--132:??",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3564786",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3564786",
  abstract =     "In this article, we propose a method to partially
                 mimic natural intelligence for the problem of lifelong
                 learning representations that are compatible. We take
                 the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "132",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Pan:2022:CIK,
  author          = {Yonghua Pan and Zechao Li and Liyan Zhang and Jinhui Tang},
  title           = {Causal Inference with Knowledge Distilling and Curriculum Learning for Unbiased {VQA}},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {3},
  articleno       = {67},
  pages           = {67:1--67:23},
  month           = aug,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3487042},
  URL             = {https://dl.acm.org/doi/10.1145/3487042},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Recently, many Visual Question Answering (VQA) models rely on the correlations between questions and answers yet neglect those between the visual information and the textual information. They would perform badly if the handled data distribute differently \ldots{}},
  bibdate         = {Thu Mar 24 08:21:55 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Yanagi:2022:IRR,
  author =       "Rintaro Yanagi and Ren Togo and Takahiro Ogawa and
                 Miki Haseyama",
  title =        "Interactive Re-ranking via Object Entropy-Guided
                 Question Answering for Cross-Modal Image Retrieval",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3",
  pages =        "68:1--68:17",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3485042",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:55 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3485042",
  abstract =     "Cross-modal image-retrieval methods retrieve desired
                 images from a query text by learning relationships
                 between texts and images. Such a retrieval approach is
                 one of the most effective ways of achieving the
                 easiness of query preparation. Recent
                 cross-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "68",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shi:2022:SIN,
  author          = {Qinghongya Shi and Hong-Bo Zhang and Zhe Li and Ji-Xiang Du and Qing Lei and Jing-Hua Liu},
  title           = {Shuffle-invariant Network for Action Recognition in Videos},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {3},
  articleno       = {69},
  pages           = {69:1--69:18},
  month           = aug,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3485665},
  URL             = {https://dl.acm.org/doi/10.1145/3485665},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {The local key features in video are important for improving the accuracy of human action recognition. However, most end-to-end methods focus on global feature learning from videos, while few works consider the enhancement of the local information in a \ldots{}},
  bibdate         = {Thu Mar 24 08:21:55 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Yuan:2022:LAS,
  author          = {Di Yuan and Xiaojun Chang and Zhihui Li and Zhenyu He},
  title           = {Learning Adaptive Spatial-Temporal Context-Aware Correlation Filters for {UAV} Tracking},
  journal         = j-TOMM,
  fjournal        = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  volume          = {18},
  number          = {3},
  articleno       = {70},
  pages           = {70:1--70:18},
  month           = aug,
  year            = {2022},
  CODEN           = {????},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  DOI             = {https://doi.org/10.1145/3486678},
  URL             = {https://dl.acm.org/doi/10.1145/3486678},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  abstract        = {Tracking in the unmanned aerial vehicle (UAV) scenarios is one of the main components of target-tracking tasks. Different from the target-tracking task in the general scenarios, the target-tracking task in the UAV scenarios is very challenging because of \ldots{}},
  bibdate         = {Thu Mar 24 08:21:55 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb
}

@Article{Sun:2022:ESR,
  author =       {Guofei Sun and Yongkang Wong and Mohan S. Kankanhalli
                 and Xiangdong Li and Weidong Geng},
  title =        {Enhanced {$3$D} Shape Reconstruction With Knowledge
                 Graph of Category Concept},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {71:1--71:20},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3491224},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3491224},
  abstract =     {Reconstructing three-dimensional (3D) objects from
                 images has attracted increasing attention due to its
                 wide applications in computer vision and robotic tasks.
                 Despite the promising progress of recent deep
                 learning-based approaches, which directly \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {71},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Li:2022:DIG,
  author =       {Jinfeng Li and Weifeng Liu and Yicong Zhou and Jun Yu
                 and Dapeng Tao and Changsheng Xu},
  title =        {Domain-invariant Graph for Adaptive Semi-supervised
                 Domain Adaptation},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {72:1--72:18},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3487194},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3487194},
  abstract =     {Domain adaptation aims to generalize a model from a
                 source domain to tackle tasks in a related but
                 different target domain. Traditional domain adaptation
                 algorithms assume that enough labeled data, which are
                 treated as the prior knowledge are available in
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {72},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Shi:2022:OOS,
  author =       {Ran Shi and Jing Ma and King Ngi Ngan and Jian Xiong
                 and Tong Qiao},
  title =        {Objective Object Segmentation Visual Quality
                 Evaluation: Quality Measure and Pooling Method},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {73:1--73:19},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3491229},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3491229},
  abstract =     {Objective object segmentation visual quality
                 evaluation is an emergent member of the visual quality
                 assessment family. It aims to develop an objective
                 measure instead of a subjective survey to evaluate the
                 object segmentation quality in agreement with
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {73},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Zeng:2022:CAS,
  author =       {Linghua Zeng and Xinmei Tian},
  title =        {{CRAR}: Accelerating Stereo Matching with Cascaded
                 Residual Regression and Adaptive Refinement},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {74:1--74:19},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3488719},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3488719},
  abstract =     {Dense stereo matching estimates the depth for each
                 pixel of the referenced images. Recently, deep learning
                 algorithms have dramatically promoted the development
                 of stereo matching. The state-of-the-art result is
                 achieved by models adopting deep \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {74},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Yao:2022:RGA,
  author =       "Lingxiang Yao and Worapan Kusakunniran and Qiang Wu
                 and Jingsong Xu and Jian Zhang",
  title =        "Recognizing Gaits Across Walking and Running Speeds",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3",
  pages =        "75:1--75:22",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3488715",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:55 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3488715",
  abstract =     "For decades, very few methods were proposed for
                 cross-mode (i.e., walking vs. running) gait
                 recognition. Thus, it remains largely unexplored
                 regarding how to recognize persons by the way they walk
                 and run. Existing cross-mode methods handle the
                 walking- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "75",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2022:IKB,
  author =       {Qun Li and Fu Xiao and Bir Bhanu and Biyun Sheng and
                 Richang Hong},
  title =        {Inner Knowledge-based {Img2Doc} Scheme for Visual
                 Question Answering},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {76:1--76:21},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3489142},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3489142},
  abstract =     {Visual Question Answering (VQA) is a research topic of
                 significant interest at the intersection of computer
                 vision and natural language understanding. Recent
                 research indicates that attributes and knowledge can
                 effectively improve performance for both \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {76},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Cornia:2022:MFA,
  author =       {Marcella Cornia and Matteo Tomei and Lorenzo Baraldi
                 and Rita Cucchiara},
  title =        {Matching Faces and Attributes Between the Artistic and
                 the Real Domain: the {PersonArt} Approach},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {77:1--77:23},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3490033},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3490033},
  abstract =     {In this article, we present an approach for retrieving
                 similar faces between the artistic and the real domain.
                 The application we refer to is an interactive
                 exhibition inside a museum, in which a visitor can take
                 a photo of himself and search for a \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {77},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Yin:2022:MFL,
  author =       {Guanghao Yin and Shouqian Sun and Dian Yu and Dejian
                 Li and Kejun Zhang},
  title =        {A Multimodal Framework for Large-Scale Emotion
                 Recognition by Fusing Music and Electrodermal Activity
                 Signals},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {78:1--78:23},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3490686},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3490686},
  abstract =     {Considerable attention has been paid to physiological
                 signal-based emotion recognition in the field of
                 affective computing. For reliability and user-friendly
                 acquisition, electrodermal activity (EDA) has a great
                 advantage in practical applications. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {78},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Buckchash:2022:GLG,
  author =       {Himanshu Buckchash and Balasubramanian Raman},
  title =        {{GraSP}: Local {Grassmannian} Spatio-Temporal Patterns
                 for Unsupervised Pose Sequence Recognition},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {79:1--79:23},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3491227},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3491227},
  abstract =     {Many applications of action recognition, especially
                 broad domains like surveillance or anomaly-detection,
                 favor unsupervised methods considering that exhaustive
                 labeling of actions is not possible. However, very
                 limited work has happened in this domain. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {79},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Zhu:2022:SSR,
  author =       {Xiaoguang Zhu and Ye Zhu and Haoyu Wang and Honglin
                 Wen and Yan Yan and Peilin Liu},
  title =        {Skeleton Sequence and {RGB} Frame Based Multi-Modality
                 Feature Fusion Network for Action Recognition},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {80:1--80:24},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3491228},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3491228},
  abstract =     {Action recognition has been a heated topic in computer
                 vision for its wide application in vision systems.
                 Previous approaches achieve improvement by fusing the
                 modalities of the skeleton sequence and RGB video.
                 However, such methods pose a dilemma between \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {80},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Chowdhury:2022:DGS,
  author =       {Debanjan Roy Chowdhury and Sukumar Nandi and Diganta
                 Goswami},
  title =        {Distributed Gateway Selection for Video Streaming in
                 {VANET} Using {IP} Multicast},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {81:1--81:24},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3491388},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3491388},
  abstract =     {The volume of video traffic as infotainment service
                 over vehicular ad hoc network (VANET) has rapidly
                 increased for past few years. Providing video streaming
                 as VANET infotainment service is very challenging
                 because of high mobility and heterogeneity of
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {81},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Alaya:2022:MVE,
  author =       {Bechir Alaya and Lamaa Sellami},
  title =        {Multilayer Video Encoding for {QoS} Managing of Video
                 Streaming in {VANET} Environment},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {82:1--82:19},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3491433},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3491433},
  abstract =     {Efficient delivery and maintenance of the quality of
                 service (QoS) of audio/video streams transmitted over
                 VANETs for mobile and heterogeneous nodes are one of
                 the major challenges in the convergence of this network
                 type and these services. In this \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {82},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Wu:2022:WPM,
  author =       {Yike Wu and Shiwan Zhao and Ying Zhang and Xiaojie
                 Yuan and Zhong Su},
  title =        {When Pairs Meet Triplets: Improving Low-Resource
                 Captioning via Multi-Objective Optimization},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {83:1--83:20},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3492325},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3492325},
  abstract =     {Image captioning for low-resource languages has
                 attracted much attention recently. Researchers propose
                 to augment the low-resource caption dataset into
                 (image, rich-resource language, and low-resource
                 language) triplets and develop the dual attention
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {83},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Yang:2022:ICD,
  author =       {Kai-Wei Yang and Yen-Yun Huang and Jen-Wei Huang and
                 Ya-Rou Hsu and Chang-Lin Wan and Hong-Han Shuai and
                 Li-Chun Wang and Wen-Huang Cheng},
  title =        {Improving Crowd Density Estimation by Fusing Aerial
                 Images and Radio Signals},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {84:1--84:23},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3492346},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3492346},
  abstract =     {A recent line of research focuses on crowd density
                 estimation from RGB images for a variety of
                 applications, for example, surveillance and traffic
                 flow control. The performance drops dramatically for
                 low-quality images, such as occlusion, or poor light
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {84},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Xia:2022:FCS,
  author =       {Zhihua Xia and Qiuju Ji and Qi Gu and Chengsheng Yuan
                 and Fengjun Xiao},
  title =        {A Format-compatible Searchable Encryption Scheme for
                 {JPEG} Images Using Bag-of-words},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {85:1--85:18},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3492705},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3492705},
  abstract =     {The development of cloud computing attracts
                 enterprises and individuals to outsource their data,
                 such as images, to the cloud server. However, direct
                 outsourcing causes the extensive concern of privacy
                 leakage, as images often contain rich sensitive
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {85},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Natgunanathan:2022:BBA,
  author =       {Iynkaran Natgunanathan and Purathani Praitheeshan and
                 Longxiang Gao and Yong Xiang and Lei Pan},
  title =        {Blockchain-Based Audio Watermarking Technique for
                 Multimedia Copyright Protection in Distribution
                 Networks},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {86:1--86:23},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3492803},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3492803},
  abstract =     {Copyright protection in multimedia protection
                 distribution is a challenging problem. To protect
                 multimedia data, many watermarking methods have been
                 proposed in the literature. However, most of them
                 cannot be used effectively in a multimedia distribution
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {86},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Guo:2022:DIE,
  author =       {Kehua Guo and Min Hu and Sheng Ren and Fangfang Li and
                 Jian Zhang and Haifu Guo and Xiaoyan Kui},
  title =        {Deep Illumination-Enhanced Face Super-Resolution
                 Network for Low-Light Images},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3},
  pages =        {87:1--87:19},
  month =        aug,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3495258},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Mar 24 08:21:55 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3495258},
  abstract =     {Face images are typically a key component in the
                 fields of security and criminal investigation. However,
                 due to lighting and shooting angles, faces taken under
                 low-light conditions are often difficult to recognize.
                 Face super-resolution (FSR) technology \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {87},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2022:SSM,
  author =       "Xiaoming Liu and Shuo Wang and Ying Zhang and Quan
                 Yuan",
  title =        "Scribble-Supervised Meibomian Glands Segmentation in
                 Infrared Images",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3",
  pages =        "88:1--88:23",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3497747",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:55 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3497747",
  abstract =     "Infrared imaging is currently the most effective
                 clinical method to evaluate the morphology of the
                 meibomian glands (MGs) in patients. As an important
                 indicator for monitoring the development of MG
                 dysfunction, it is necessary to accurately measure
                 gland- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "88",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Singh:2022:TII,
  author =       "Kedar Nath Singh and Amit Kumar Singh",
  title =        "Towards Integrating Image Encryption with Compression:
                 a Survey",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3",
  pages =        "89:1--89:21",
  month =        aug,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3498342",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:55 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3498342",
  abstract =     "As digital images are consistently generated and
                 transmitted online, the unauthorized utilization of
                 these images is an increasing concern that has a
                 significant impact on both security and privacy issues;
                 additionally, the representation of digital \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "89",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{MontenegroMarin:2022:ISI,
  author =       {Carlos Enrique {Montenegro Marin} and Dinesh Jackson
                 Samuel and Nallappan Gunasekaran},
  title =        {Introduction to the Special Issue on {6G} Enabled
                 Interactive Multimedia Communication Systems},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3s},
  pages =        {133:1--133:??},
  month =        oct,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3567835},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:31 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3567835},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {133e},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Li:2022:CAP,
  author =       {Ran Li and Wei Wei and Peinan Hao and Jian Su and
                 Fengyuan Sun},
  title =        {Context-aware Pseudo-true Video Interpolation at {6G}
                 Edge},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3s},
  pages =        {133:1--133:??},
  month =        oct,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3555313},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:31 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3555313},
  abstract =     {In the 6G network, lots of edge devices facilitate the
                 low-latency transmission of video. However, with
                 limited processing and storage capabilities, the edge
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {133},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Alharbi:2022:NSA,
  author =       "Abdullah Alharbi and Mohammed Aljebreen and Amr Tolba
                 and Konstantinos A. Lizos and Saied Abd El-Atty and
                 Farid Shawki",
  title =        "A Normalized Slicing-assigned Virtualization Method
                 for {6G}-based Wireless Communication Systems",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "134:1--134:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3546077",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3546077",
  abstract =     "The next generation of wireless communication systems
                 will rely on advantageous sixth-generation wireless
                 network (6G) features and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "134",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2022:ISIb,
  author =       {Yin Zhang and Iztok Humar and Jia Liu and Alireza
                 Jolfaei},
  title =        {Introduction to the Special Issue on Affective
                 Services based on Representation Learning},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3s},
  pages =        {135:1--135:??},
  month =        oct,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3567836},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:31 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3567836},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {135e},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Xu:2022:DBJ,
  author =       {Kexin Xu and Haijun Zhang and Keping Long and Jianquan
                 Wang and Lei Sun},
  title =        {{DRL} based Joint Affective Services Computing and
                 Resource Allocation in {ISTN}},
  journal =      j-TOMM,
  volume =       {18},
  number =       {3s},
  pages =        {135:1--135:??},
  month =        oct,
  year =         {2022},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3561821},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:31 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3561821},
  abstract =     {Affective services will become a research hotspot in
                 artificial intelligence (AI) in the next decade. In
                 this paper, a novel service paradigm combined with
                 wireless \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {135},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2022:AIA,
  author =       "Yazhou Zhang and Prayag Tiwari and Lu Rong and Rui
                 Chen and Nojoom A. Alnajem and M. Shamim Hossain",
  title =        "Affective Interaction: Attentive Representation
                 Learning for Multi-Modal Sentiment Classification",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "136:1--136:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3527175",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3527175",
  abstract =     "The recent booming of artificial intelligence (AI)
                 applications, e.g., affective robots, human-machine
                 interfaces, autonomous vehicles, and so on, has
                 produced \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "136",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2022:BRJ,
  author =       "Xiaoqin Wang and Chen Chen and Rushi Lan and Licheng
                 Liu and Zhenbing Liu and Huiyu Zhou and Xiaonan Luo",
  title =        "Binary Representation via Jointly Personalized Sparse
                 Hashing",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "137:1--137:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3558769",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3558769",
  abstract =     "Unsupervised hashing has attracted much attention for
                 binary representation learning due to the requirement
                 of economical storage and efficiency of binary
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "137",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Jin:2022:AAA,
  author =       "Xin Jin and Xinning Li and Hao Lou and Chenyu Fan and
                 Qiang Deng and Chaoen Xiao and Shuai Cui and Amit Kumar
                 Singh",
  title =        "Aesthetic Attribute Assessment of Images Numerically
                 on Mixed Multi-attribute Datasets",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "138:1--138:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3547144",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3547144",
  abstract =     "With the continuous development of social software and
                 multimedia technology, images have become a kind of
                 important carrier for spreading information and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "138",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cao:2022:SBH,
  author =       "Jie Cao and Youquan Wang and Haicheng Tao and Xiang
                 Guo",
  title =        "Sensor-based Human Activity Recognition Using Graph
                 {LSTM} and Multi-task Classification Model",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "139:1--139:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3561387",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3561387",
  abstract =     "This paper explores human activities recognition from
                 sensor-based multi-dimensional streams. Recently, deep
                 learning-based methods such as \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "139",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2022:OTV,
  author =       "Jiawei Huang and Qichen Su and Weihe Li and Zhuoran
                 Liu and Tao Zhang and Sen Liu and Ping Zhong and
                 Wanchun Jiang and Jianxin Wang",
  title =        "Opportunistic Transmission for Video Streaming over
                 Wild {Internet}",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "140:1--140:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3488722",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3488722",
  abstract =     "The video streaming system employs adaptive bitrate
                 (ABR) algorithms to optimize a user's quality of
                 experience. However, it is hard for ABR \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "140",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Duanmu:2022:BQE,
  author =       "Zhengfang Duanmu and Wentao Liu and Diqi Chen and
                 Zhuoran Li and Zhou Wang and Yizhou Wang and Wen Gao",
  title =        "A {Bayesian} Quality-of-Experience Model for Adaptive
                 Streaming Videos",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "141:1--141:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491432",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491432",
  abstract =     "The fundamental conflict between the enormous space of
                 adaptive streaming videos and the limited capacity for
                 subjective experiment casts significant \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "141",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ignat:2022:WDI,
  author =       "Oana Ignat and Santiago Castro and Yuhang Zhou and
                 Jiajun Bao and Dandan Shan and Rada Mihalcea",
  title =        "When Did It Happen? {Duration}-informed Temporal
                 Localization of Narrated Actions in Vlogs",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "142:1--142:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3495211",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3495211",
  abstract =     "We consider the task of temporal human action
                 localization in lifestyle vlogs. We introduce a novel
                 dataset consisting of manual annotations of temporal
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "142",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shi:2022:HMU,
  author =       "Wuzhen Shi and Shaohui Liu",
  title =        "Hiding Message Using a Cycle Generative Adversarial
                 Network",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "143:1--143:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3495566",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3495566",
  abstract =     "Training an image steganography is an unsupervised
                 problem, because it is impossible to obtain an ideal
                 supervised steganographic image \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "143",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hui:2022:STC,
  author =       "Chen Hui and Shaohui Liu and Wuzhen Shi and Feng Jiang
                 and Debin Zhao",
  title =        "Spatio-Temporal Context Based Adaptive Camcorder
                 Recording Watermarking",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "144:1--144:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3503160",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3503160",
  abstract =     "Video watermarking technology has attracted increasing
                 attention in the past few years, and a great deal of
                 traditional and deep learning-based methods \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "144",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2022:BAP,
  author =       "Jian Zhao and Xianhui Liu and Weidong Zhao",
  title =        "Balanced and Accurate Pseudo-Labels for
                 Semi-Supervised Image Classification",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "145:1--145:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3506711",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3506711",
  abstract =     "Image classification by semi-supervised learning has
                 recently become a hot spot, and the Co-Training
                 framework is an important method of semi-supervised
                 image \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "145",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Stacchio:2022:THA,
  author =       "Lorenzo Stacchio and Alessia Angeli and Giuseppe
                 Lisanti and Daniela Calanca and Gustavo Marfia",
  title =        "Toward a Holistic Approach to the Socio-historical
                 Analysis of Vernacular Photos",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "146:1--146:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3507918",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3507918",
  abstract =     "Although one of the most popular practices in
                 photography since the end of the 19th century, an
                 increase in scholarly interest in family photo albums
                 dates back to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "146",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xiao:2022:DAS,
  author =       "Hui-Chu Xiao and Wan-Lei Zhao and Jie Lin and Yi-Geng
                 Hong and Chong-Wah Ngo",
  title =        "Deeply Activated Salient Region for Instance Search",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "147:1--147:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3510004",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3510004",
  abstract =     "The performance of instance search relies heavily on
                 the ability to locate and describe a wide variety of
                 object instances in a video/image collection. Due to
                 the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "147",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2022:CEC,
  author =       "Zuquan Liu and Guopu Zhu and Feng Ding and Xiangyang
                 Luo and Sam Kwong and Peng Li",
  title =        "Contrast-Enhanced Color Visual Cryptography for $ (k,
                 n) $ Threshold Schemes",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "148:1--148:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3508394",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3508394",
  abstract =     "In traditional visual cryptography schemes (VCSs),
                 pixel expansion remains to be an unsolved challenge. To
                 alleviate the impact of pixel expansion, several
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "148",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2022:DSS,
  author =       "Zhe Liu and Xian-Hua Han",
  title =        "Deep Self-Supervised Hyperspectral Image
                 Reconstruction",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "149:1--149:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3510373",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3510373",
  abstract =     "Reconstructing a high-resolution hyperspectral (HR-HS)
                 image via merging a low-resolution hyperspectral
                 (LR-HS) image and a high-resolution RGB \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "149",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Singh:2022:SSD,
  author =       "Gurinder Singh and Puneet Goyal",
  title =        "{SDCN2}: a Shallow Densely Connected {CNN} for
                 Multi-Purpose Image Manipulation Detection",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "150:1--150:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3510462",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3510462",
  abstract =     "Digital image information can be easily tampered with
                 to harm the integrity of someone. Thus, recognizing the
                 truthfulness and processing history of an image
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "150",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2022:SGS,
  author =       "Yunfei Liu and Yu Li and Shaodi You and Feng Lu",
  title =        "Semantic Guided Single Image Reflection Removal",
  journal =      j-TOMM,
  volume =       "18",
  number =       "3s",
  pages =        "151:1--151:??",
  month =        oct,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3510821",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:31 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3510821",
  abstract =     "Reflection is common when we see through a glass
                 window, which not only is a visual disturbance but also
                 influences the performance of computer vision
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "151",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2022:IFD,
  author =       "Jingjing Wu and Jianguo Jiang and Meibin Qi and Cuiqun
                 Chen and Yimin Liu",
  title =        "Improving Feature Discrimination for Object Tracking
                 by Structural-similarity-based Metric Learning",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "90:1--90:23",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3497746",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3497746",
  abstract =     "Existing approaches usually form the tracking task as
                 an appearance matching procedure. However, the
                 discrimination ability of appearance features is
                 insufficient in these trackers, which is caused by
                 their weak feature supervision constraints and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "90",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2022:IBP,
  author =       "Xiaowen Huang and Jitao Sang and Changsheng Xu",
  title =        "Image-Based Personality Questionnaire Design",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "91:1--91:??",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3503489",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:32 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3503489",
  abstract =     "This article explores the problem of image-based
                 personality questionnaire design. Compared with the
                 traditional text-based personality questionnaire, the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "91",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hao:2022:DLL,
  author =       "Shijie Hao and Xu Han and Yanrong Guo and Meng Wang",
  title =        "Decoupled Low-Light Image Enhancement",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "92:1--92:19",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3498341",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3498341",
  abstract =     "The visual quality of photographs taken under
                 imperfect lightness conditions can be degenerated by
                 multiple factors, e.g., low lightness, imaging noise,
                 color distortion, and so on. Current low-light image
                 enhancement models focus on the improvement of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "92",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2022:AQR,
  author =       "Yibing Liu and Yangyang Guo and Jianhua Yin and
                 Xuemeng Song and Weifeng Liu and Liqiang Nie and Min
                 Zhang",
  title =        "Answer Questions with Right Image Regions: a Visual
                 Attention Regularization Approach",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "93:1--93:18",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3498340",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3498340",
  abstract =     "Visual attention in Visual Question Answering (VQA)
                 targets at locating the right image regions regarding
                 the answer prediction, offering a powerful technique to
                 promote multi-modal understanding. However, recent
                 studies have pointed out that the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "93",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yu:2022:DAM,
  author =       "Yang Yu and Rongrong Ni and Wenjie Li and Yao Zhao",
  title =        "Detection of {AI-Manipulated} Fake Faces via Mining
                 Generalized Features",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "94:1--94:23",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3499026",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3499026",
  abstract =     "Recently, AI-manipulated face techniques have
                 developed rapidly and constantly, which has raised new
                 security issues in society. Although existing detection
                 methods consider different categories of fake faces,
                 the performance on detecting the fake faces \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "94",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cheng:2022:CMG,
  author =       "Yuhao Cheng and Xiaoguang Zhu and Jiuchao Qian and Fei
                 Wen and Peilin Liu",
  title =        "Cross-modal Graph Matching Network for Image-text
                 Retrieval",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "95:1--95:23",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3499027",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3499027",
  abstract =     "Image-text retrieval is a fundamental cross-modal task
                 whose main idea is to learn image-text matching.
                 Generally, according to whether there exist
                 interactions during the retrieval process, existing
                 image-text retrieval methods can be classified into
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "95",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Dogariu:2022:GRS,
  author =       "Mihai Dogariu and Liviu-Daniel {\c{S}}tefan and Bogdan
                 Andrei Boteanu and Claudiu Lamba and Bomi Kim and
                 Bogdan Ionescu",
  title =        "Generation of Realistic Synthetic Financial
                 Time-series",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "96:1--96:27",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501305",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501305",
  abstract =     "Financial markets have always been a point of interest
                 for automated systems. Due to their complex nature,
                 financial algorithms and fintech frameworks require
                 vast amounts of data to accurately respond to market
                 fluctuations. This data availability is \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "96",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zheng:2022:CMS,
  author =       "Yi Zheng and Yong Zhou and Jiaqi Zhao and Ying Chen
                 and Rui Yao and Bing Liu and Abdulmotaleb {El Saddik}",
  title =        "Clustering Matters: Sphere Feature for Fully
                 Unsupervised Person Re-identification",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "97:1--97:18",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501404",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501404",
  abstract =     "In person re-identification (Re-ID), the data
                 annotation cost of supervised learning, is huge and it
                 cannot adapt well to complex situations. Therefore,
                 compared with supervised deep learning methods,
                 unsupervised methods are more in line with actual
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "97",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tang:2022:HMB,
  author =       "Zengming Tang and Jun Huang",
  title =        "Harmonious Multi-branch Network for Person
                 Re-identification with Harder Triplet Loss",
  journal =      j-TOMM,
  volume =       "18",
  number =       "4",
  pages =        "98:1--98:21",
  month =        nov,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3501405",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Mar 24 08:21:57 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3501405",
  abstract =     "Recently, advances in person re-identification (Re-ID)
                 has benefitted from use of the popular multi-branch
                 network. However, performing feature learning in a
                 single branch with uniform partitioning is likely to
                 separate meaningful local regions, and \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "98",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Xu:2022:TCA,
  author          = {Yifan Xu and Kekai Sheng and Weiming Dong and Baoyuan
                     Wu and Changsheng Xu and Bao-Gang Hu},
  title           = {Towards Corruption-Agnostic Robust Domain Adaptation},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {99:1--99:16},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3501800},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3501800},
  abstract        = {Great progress has been achieved in domain adaptation
                     in decades. Existing works are always based on an ideal
                     assumption that testing target domains are independent
                     and identically distributed with training target
                     domains. However, due to unpredictable \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {99},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Lin:2022:JSC,
  author          = {Jinzhi Lin and Yun Zhang and Na Li and Hongling
                     Jiang},
  title           = {Joint Source-Channel Decoding of Polar Codes for
                     {HEVC}-Based Video Streaming},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {100:1--100:23},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3502208},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3502208},
  abstract        = {Ultra High-Definition (UHD) and Virtual Reality (VR)
                     video streaming over 5G networks are emerging, in which
                     High-Efficiency Video Coding (HEVC) is used as source
                     coding to compress videos more efficiently and polar
                     code is used as channel coding to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {100},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Li:2022:DES,
  author          = {Yongrui Li and Zengfu Wang and Jun Yu},
  title           = {Densely Enhanced Semantic Network for Conversation
                     System in Social Media},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {101:1--101:24},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3501799},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3501799},
  abstract        = {The human-computer conversation system is a
                     significant application in the field of multimedia. To
                     select an appropriate response, retrieval-based systems
                     model the matching between the dialogue history and
                     response candidates. However, most of the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {101},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Lin:2022:NCN,
  author          = {Kai Lin and Chuanmin Jia and Xinfeng Zhang and Shanshe
                     Wang and Siwei Ma and Wen Gao},
  title           = {{NR-CNN}: Nested-Residual Guided {CNN} In-loop
                     Filtering for Video Coding},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {102:1--102:22},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3502723},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3502723},
  abstract        = {Recently, deep learning for video coding, such as deep
                     predictive coding, deep transform coding, and deep
                     in-loop filtering, has been an emerging research area.
                     The coding gain of hybrid coding framework could be
                     extensively promoted by the data-driven \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {102},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Dai:2022:FFS,
  author          = {Hanbin Dai and Hailin Shi and Wu Liu and Linfang Wang
                     and Yinglu Liu and Tao Mei},
  title           = {{FasterPose}: a Faster Simple Baseline for Human Pose
                     Estimation},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {103:1--103:16},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3503464},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3503464},
  abstract        = {The performance of human pose estimation depends on
                     the spatial accuracy of keypoint localization. Most
                     existing methods pursue the spatial accuracy through
                     learning the high-resolution (HR) representation from
                     input images. By the experimental analysis, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {103},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Man:2022:SAR,
  author          = {Xin Man and Deqiang Ouyang and Xiangpeng Li and
                     Jingkuan Song and Jie Shao},
  title           = {Scenario-Aware Recurrent Transformer for Goal-Directed
                     Video Captioning},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {104:1--104:17},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3503927},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3503927},
  abstract        = {Fully mining visual cues to aid in content
                     understanding is crucial for video captioning. However,
                     most state-of-the-art video captioning methods are
                     limited to generating captions purely based on
                     straightforward information while ignoring the scenario
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {104},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2022:OCC,
  author          = {Tianjun Zhang and Hao Deng and Lin Zhang and Shengjie
                     Zhao and Xiao Liu and Yicong Zhou},
  title           = {Online Correction of Camera Poses for the
                     Surround-view System: a Sparse Direct Approach},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {106:1--106:24},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3505252},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3505252},
  abstract        = {The surround-view module is an indispensable component
                     of a modern advanced driving assistance system. By
                     calibrating the intrinsics and extrinsics of the
                     surround-view cameras accurately, a top-down
                     surround-view can be generated from raw fisheye images.
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {106},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wang:2022:MGB,
  author          = {Quan Wang and Sheng Li and Xinpeng Zhang and Guorui
                     Feng},
  title           = {Multi-granularity Brushstrokes Network for Universal
                     Style Transfer},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {107:1--107:17},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3506710},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3506710},
  abstract        = {Neural style transfer has been developed in recent
                     years, where both performance and efficiency have been
                     greatly improved. However, most existing methods do not
                     transfer the brushstrokes information of style images
                     well. In this article, we address this \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {107},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Saxena:2022:PSU,
  author          = {Nidhi Saxena and Balasubramanian Raman},
  title           = {Pansharpening Scheme Using Bi-dimensional Empirical
                     Mode Decomposition and Neural Network},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {108:1--108:22},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3506709},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3506709},
  abstract        = {The pansharpening is a combination of multispectral
                     (MS) and panchromatic (PAN) images that produce a
                     high-spatial-spectral-resolution MS images. In
                     multiresolution analysis-based pansharpening schemes,
                     some spatial and spectral distortions are found. It
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {108},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wu:2022:EEH,
  author          = {Jingjing Wu and Jianguo Jiang and Meibin Qi and Cuiqun
                     Chen and Jingjing Zhang},
  title           = {An End-to-end Heterogeneous Restraint Network for
                     {RGB-D} Cross-modal Person Re-identification},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {109:1--109:22},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3506708},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3506708},
  abstract        = {The RGB-D cross-modal person re-identification (re-id)
                     task aims to identify the person of interest across the
                     RGB and depth image modes. The tremendous discrepancy
                     between these two modalities makes this task difficult
                     to tackle. Few researchers pay \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {109},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Liu:2022:SRP,
  author          = {Caixia Liu and Dehui Kong and Shaofan Wang and Jinghua
                     Li and Baocai Yin},
  title           = {A Spatial Relationship Preserving Adversarial Network
                     for {$3$D} Reconstruction from a Single Depth View},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {110:1--110:22},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3506733},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3506733},
  abstract        = {Recovering the geometry of an object from a single
                     depth image is an interesting yet challenging problem.
                     While previous learning based approaches have
                     demonstrated promising performance, they don't fully
                     explore spatial relationships of objects, which
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {110},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Ren:2022:EES,
  author          = {Ruyong Ren and Shaozhang Niu and Hua Ren and Shubin
                     Zhang and Tengyue Han and Xiaohai Tong},
  title           = {{ESRNet}: Efficient Search and Recognition Network for
                     Image Manipulation Detection},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {111:1--111:23},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3506853},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3506853},
  abstract        = {With the widespread use of smartphones and the rise of
                     intelligent software, we can manipulate captured photos
                     anytime and anywhere, so the fake photos finally
                     obtained look ``Real.'' If these intelligent operation
                     methods are maliciously applied to our \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {111},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Duan:2022:NMS,
  author          = {Mingxing Duan and Kenli Li and Jiayan Deng and Bin
                     Xiao and Qi Tian},
  title           = {A Novel Multi-Sample Generation Method for Adversarial
                     Attacks},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {112:1--112:21},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3506852},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3506852},
  abstract        = {Deep learning models are widely used in daily life,
                     which bring great convenience to our lives, but they
                     are vulnerable to attacks. How to build an attack
                     system with strong generalization ability to test the
                     robustness of deep learning systems is a hot \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {112},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Guo:2022:ATA,
  author          = {Yang Guo and Wei Gao and Siwei Ma and Ge Li},
  title           = {Accelerating Transform Algorithm Implementation for
                     Efficient Intra Coding of {8K UHD} Videos},
  journal         = j-TOMM,
  volume          = {18},
  number          = {4},
  pages           = {113:1--113:20},
  month           = nov,
  year            = {2022},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3507970},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Mar 24 08:21:57 MDT 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3507970},
  abstract        = {Real-time ultra-high-definition (UHD) video
                     applications have attracted much attention, where the
                     encoder side urgently demands the high-throughput
                     two-dimensional (2D) transform hardware implementation
                     for the latest video coding standards. This article
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {113},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Shao:2023:SIP,
  author          = {Xuan Shao and Ying Shen and Lin Zhang and Shengjie
                     Zhao and Dandan Zhu and Yicong Zhou},
  title           = {{SLAM} for Indoor Parking: a Comprehensive Benchmark
                     Dataset and a Tightly Coupled Semantic Framework},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {1:1--1:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3510856},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3510856},
  abstract        = {For the task of autonomous indoor parking, various
                     Visual-Inertial Simultaneous Localization And Mapping
                     (SLAM) systems are expected to achieve \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Sharma:2023:WBA,
  author          = {Prasen Sharma and Ira Bisht and Arijit Sur},
  title           = {Wavelength-based Attributed Deep Neural Network for
                     Underwater Image Restoration},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {2:1--2:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3511021},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3511021},
  abstract        = {Background: Underwater images, in general, suffer from
                     low contrast and high color distortions due to the
                     non-uniform attenuation of the light as it \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Li:2023:SCE,
  author          = {Jie Li and Ling Han and Chong Zhang and Qiyue Li and
                     Zhi Liu},
  title           = {Spherical Convolution Empowered Viewport Prediction in
                     360 Video Multicast with Limited {FoV} Feedback},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {3:1--3:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3511603},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3511603},
  abstract        = {Field of view (FoV) prediction is critical in
                     360-degree video multicast, which is a key component of
                     the emerging virtual reality and augmented reality
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Le:2023:ASN,
  author          = {Thi-Ngoc-Hanh Le and Chih-Kuo Yeh and Ying-Chi Lin and
                     Tong-Yee Lee},
  title           = {Animating Still Natural Images Using Warping},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {4:1--4:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3511894},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3511894},
  abstract        = {From a single still image, a looping video could be
                     generated by imparting subtle motion to objects in the
                     image. The results are a hybrid of photography and
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Xiong:2023:RRD,
  author          = {Lizhi Xiong and Xiao Han and Ching-Nung Yang and
                     Zhihua Xia},
  title           = {{RDH-DES}: Reversible Data Hiding over Distributed
                     Encrypted-Image Servers Based on Secret Sharing},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {5:1--5:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3512797},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3512797},
  abstract        = {Reversible Data Hiding in Encrypted Image (RDHEI)
                     schemes may redistribute the data hiding procedure to
                     other parties and can preserve privacy of the cover
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhen:2023:TAO,
  author          = {Peining Zhen and Shuqi Wang and Suming Zhang and
                     Xiaotao Yan and Wei Wang and Zhigang Ji and Hai-Bao
                     Chen},
  title           = {Towards Accurate Oriented Object Detection in Aerial
                     Images with Adaptive Multi-level Feature Fusion},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {6:1--6:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3513133},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3513133},
  abstract        = {Detecting objects in aerial images is a long-standing
                     and challenging problem since the objects in aerial
                     images vary dramatically in size and orientation. Most
                     existing \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Song:2023:DSD,
  author          = {Yue Song and Hao Tang and Nicu Sebe and Wei Wang},
  title           = {Disentangle Saliency Detection into Cascaded Detail
                     Modeling and Body Filling},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {7:1--7:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3513134},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3513134},
  abstract        = {Salient object detection has been long studied to
                     identify the most visually attractive objects in
                     images/videos. Recently, a growing amount of approaches
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2023:BSG,
  author          = {Yong Zhang and Yingwei Pan and Ting Yao and Rui Huang
                     and Tao Mei and Chang-Wen Chen},
  title           = {Boosting Scene Graph Generation with Visual Relation
                     Saliency},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {8:1--8:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3514041},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3514041},
  abstract        = {The scene graph is a symbolic data structure that
                     comprehensively describes the objects and visual
                     relations in a visual scene, while ignoring the
                     inherent perceptual \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Chen:2023:BVL,
  author          = {Jingwen Chen and Jianjie Luo and Yingwei Pan and Yehao
                     Li and Ting Yao and Hongyang Chao and Tao Mei},
  title           = {Boosting Vision-and-Language Navigation with Direction
                     Guiding and Backtracing},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {9:1--9:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3526024},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3526024},
  abstract        = {Vision-and-Language Navigation (VLN) has been an
                     emerging and fast-developing research topic, where an
                     embodied agent is required to navigate in a \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Rao:2023:DPZ,
  author          = {Yunbo Rao and Ziqiang Yang and Shaoning Zeng and
                     Qifeng Wang and Jiansu Pu},
  title           = {Dual Projective Zero-Shot Learning Using Text
                     Descriptions},
  journal         = j-TOMM,
  volume          = {19},
  number          = {1},
  pages           = {10:1--10:??},
  month           = jan,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3514247},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:33 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3514247},
  abstract        = {Zero-shot learning (ZSL) aims to recognize image
                     instances of unseen classes solely based on the
                     semantic descriptions of the unseen classes. In this
                     field, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@Article{Yu:2023:MVS,
  author =       {Hang Yu and Chilam Cheang and Yanwei Fu and Xiangyang Xue},
  title =        {Multi-view Shape Generation for a {$3$D} Human-like Body},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {11:1--11:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3514248},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3514248},
  abstract =     {Three-dimensional (3D) human-like body reconstruction via a
                 single RGB image has attracted significant research
                 attention recently. Most of the existing \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {11},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Chen:2023:WST,
  author =       {Weidong Chen and Guorong Li and Xinfeng Zhang and Shuhui
                 Wang and Liang Li and Qingming Huang},
  title =        {Weakly Supervised Text-based Actor-Action Video
                 Segmentation by Clip-level Multi-instance Learning},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {12:1--12:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3514250},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3514250},
  abstract =     {In real-world scenarios, it is common that a video contains
                 multiple actors and their activities. Selectively
                 localizing one specific actor and its action spatially and
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {12},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Shen:2023:QFC,
  author =       {Feihong Shen and Jun Liu},
  title =        {Quantum {Fourier} Convolutional Network},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {13:1--13:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3514249},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3514249},
  abstract =     {The neural network and quantum computing are both
                 significant and appealing fields, with their interactive
                 disciplines promising for large-scale computing tasks
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {13},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Wu:2023:BBT,
  author =       {Xiaotian Wu and Peng Yao},
  title =        {{Boolean}-based Two-in-One Secret Image Sharing by Adaptive
                 Pixel Grouping},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {14:1--14:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3517140},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3517140},
  abstract =     {The two-in-one secret image sharing (TiOSIS) technique is a
                 hybrid scheme that protects a secret image by combining
                 visual cryptography (VCS) and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {14},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Yadav:2023:DML,
  author =       {Ashima Yadav and Dinesh Kumar Vishwakarma},
  title =        {A Deep Multi-level Attentive Network for Multimodal
                 Sentiment Analysis},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {15:1--15:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3517139},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3517139},
  abstract =     {Multimodal sentiment analysis has attracted increasing
                 attention with broad application prospects. Most of the
                 existing methods have focused on a single modality,
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {15},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

%%% NB: Duran Barroso is a two-part family name; it is braced so that
%%% BibTeX keeps both words together as the surname instead of treating
%%% Duran as a given name.
@Article{Gao:2023:NGA,
  author =       "Honghao Gao and Baobin Dai and Huaikou Miao and
                 Xiaoxian Yang and Ramon J. {Duran Barroso} and Hussain
                 Walayat",
  title =        "A Novel {GAPG} Approach to Automatic Property
                 Generation for Formal Verification: The {GAN}
                 Perspective",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3517154",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3517154",
  abstract =     "Formal methods have been widely used to support
                 software testing to guarantee correctness and
                 reliability. For example, model checking technology
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2023:LSS,
  author =       {Pengyi Zhang and Huanzhang Dou and Wenhu Zhang and Yuhan
                 Zhao and Zequn Qin and Dongping Hu and Yi Fang and Xi Li},
  title =        {A Large-Scale Synthetic Gait Dataset Towards in-the-Wild
                 Simulation and Comparison Study},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {17:1--17:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3517199},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3517199},
  abstract =     {Gait recognition has a rapid development in recent years.
                 However, current gait recognition focuses primarily on
                 ideal laboratory scenes, leaving the gait \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {17},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Zhou:2023:DAB,
  author =       {Wei Zhou and Zhiwu Xia and Peng Dou and Tao Su and Haifeng
                 Hu},
  title =        {Double Attention Based on Graph Attention Network for Image
                 Multi-Label Classification},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {18:1--18:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3519030},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3519030},
  abstract =     {The task of image multi-label classification is to
                 accurately recognize multiple objects in an input image.
                 Most of the recent works need to leverage the label
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {18},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2023:ANM,
  author =       {Xianlin Zhang and Mengling Shen and Xueming Li and Xiaojie
                 Wang},
  title =        {{AABLSTM}: a Novel Multi-task Based {CNN-RNN} Deep Model
                 for Fashion Analysis},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {19:1--19:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3519029},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3519029},
  abstract =     {With the rapid growth of online commerce and
                 fashion-related applications, visual clothing analysis and
                 recognition has become a hotspot in computer \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {19},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2023:GML,
  author =       {Deyin Liu and Lin (Yuanbo) Wu and Richang Hong and Zongyuan
                 Ge and Jialie Shen and Farid Boussaid and Mohammed
                 Bennamoun},
  title =        {Generative Metric Learning for Adversarially Robust
                 Open-world Person Re-Identification},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {20:1--20:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3522714},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3522714},
  abstract =     {The vulnerability of re-identification (re-ID) models under
                 adversarial attacks is of significant concern as criminals
                 may use adversarial perturbations to evade \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {20},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Wang:2023:BHI,
  author =       {Shuo Wang and Huixia Ben and Yanbin Hao and Xiangnan He and
                 Meng Wang},
  title =        {Boosting Hyperspectral Image Classification with Dual
                 Hierarchical Learning},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {21:1--21:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3522713},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3522713},
  abstract =     {Hyperspectral image (HSI) classification aims at predicting
                 the pixel-wise labels in an image, where there are only a
                 few labeled pixel samples (hard labels) for \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {21},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Wu:2023:DUD,
  author =       {Dayan Wu and Qi Dai and Bo Li and Weiping Wang},
  title =        {Deep Uncoupled Discrete Hashing via Similarity Matrix
                 Decomposition},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {22:1--22:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3524021},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3524021},
  abstract =     {Hashing has been drawing increasing attention in the task
                 of large-scale image retrieval owing to its storage and
                 computation efficiency, especially \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {22},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Cheung:2023:SNA,
  author =       {Ming Cheung and Weiwei Sun and James She and Jiantao Zhou},
  title =        {Social Network Analytic-Based Online Counterfeit Seller
                 Detection using User Shared Images},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {23:1--23:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3524135},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3524135},
  abstract =     {Selling counterfeit online has become a serious problem,
                 especially with the advancement of social media and mobile
                 technology. Instead of \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {23},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

%%% NB: Author names are stored given-name first, like the rest of this
%%% file (family names here: Lu, Chen, Li, Deng, Zhao, Zhang, Han).  The
%%% citation label keeps its Feihong prefix so that existing citations
%%% of this entry continue to resolve.
@Article{Feihong:2023:THQ,
  author =       "Feihong Lu and Hang Chen and Kang Li and Qiliang Deng
                 and Jian Zhao and Kaipeng Zhang and Hong Han",
  title =        "Toward High-quality Face-Mask Occluded Restoration",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3524137",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3524137",
  abstract =     "Face-mask occluded restoration aims at restoring the
                 masked region of a human face, which has attracted
                 increasing attention in the context of the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2023:CSL,
  author =       {Yajing Liu and Zhiwei Xiong and Ya Li and Yuning Lu and
                 Xinmei Tian and Zheng-Jun Zha},
  title =        {Category-Stitch Learning for Union Domain Generalization},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1},
  pages =        {25:1--25:??},
  month =        jan,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3524136},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3524136},
  abstract =     {Domain generalization aims at generalizing the network
                 trained on multiple domains to unknown but related domains.
                 Under the assumption that different domains \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {25},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

%%% NB(review): the N in the title is set as a math-mode exponent on the
%%% parenthesized phrase, on the understanding that the published title
%%% typesets it as a superscript; confirm against the DOI landing page.
@Article{Ferrari:2023:CRR,
  author =       "Claudio Ferrari and Federico Becattini and Leonardo
                 Galteri and Alberto {Del Bimbo}",
  title =        "{(Compress and Restore)$^N$}: a Robust Defense Against
                 Adversarial Attacks on Image Classification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "26:1--26:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3524619",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3524619",
  abstract =     "Modern image classification approaches often rely on
                 deep neural networks, which have shown pronounced
                 weakness to adversarial examples: images \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Song:2023:SSC,
  author =       {Yaguang Song and Xiaoshan Yang and Changsheng Xu},
  title =        {Self-supervised Calorie-aware Heterogeneous Graph Networks
                 for Food Recommendation},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {27:1--27:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3524618},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3524618},
  abstract =     {With the rapid development of online recipe sharing
                 platforms, food recommendation is emerging as an important
                 application. Although \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {27},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Xue:2023:LEE,
  author =       {Feng Xue and Tian Yang and Kang Liu and Zikun Hong and
                 Mingwei Cao and Dan Guo and Richang Hong},
  title =        {{LCSNet}: End-to-end Lipreading with Channel-aware Feature
                 Selection},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {28:1--28:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3524620},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3524620},
  abstract =     {Lipreading is a task of decoding the movement of the
                 speaker's lip region into text. In recent years, lipreading
                 methods based on deep neural network \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {28},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Fu:2023:LPA,
  author =       {Zilong Fu and Hongtao Xie and Shancheng Fang and Yuxin Wang
                 and Mengting Xing and Yongdong Zhang},
  title =        {Learning Pixel Affinity Pyramid for Arbitrary-Shaped Text
                 Detection},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {29:1--29:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3524617},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3524617},
  abstract =     {Arbitrary-shaped text detection in natural images is a
                 challenging task due to the complexity of the background
                 and the diversity of text properties. The \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {29},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

%%% NB: In the author list, Cardia Neto and Del Bimbo are braced as
%%% multi-word family names.  Marana is a single-word family name
%%% (given names Aparecido Nilceu), so it is left unbraced.
@Article{CardiaNeto:2023:LSA,
  author =       "Jo{\~a}o Baptista {Cardia Neto} and Claudio Ferrari
                 and Aparecido Nilceu Marana and Stefano Berretti and
                 Alberto {Del Bimbo}",
  title =        "Learning Streamed Attention Network from Descriptor
                 Images for Cross-Resolution {$3$D} Face Recognition",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "30:1--30:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3527158",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3527158",
  abstract =     "In this article, we propose a hybrid framework for
                 cross-resolution 3D face recognition which utilizes a
                 Streamed Attention Network (SAN) that \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2023:TMM,
  author =       {Xin Huang},
  title =        {On Teaching Mode of {MTI} Translation Workshop Based on
                 {IPT} Corpus for {Tibetan} Areas of {China}},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {31:1--31:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3527173},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3527173},
  abstract =     {With the technological turn of applied research in
                 translation, increasing attention has been paid to the
                 teaching of translation technology. This article addresses
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {31},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Xu:2023:MMM,
  author =       {Liming Xu and Xianhua Zeng and Weisheng Li and Bochuan
                 Zheng},
  title =        {{MFGAN}: Multi-modal Feature-fusion for {CT} Metal Artifact
                 Reduction Using {GANs}},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {32:1--32:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3528172},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3528172},
  abstract =     {Due to the existence of metallic implants in certain
                 patients, the Computed Tomography (CT) images from these
                 patients are often corrupted by \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {32},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Hu:2023:DIP,
  author =       {Yuzhang Hu and Wenhan Yang and Jiaying Liu and Zongming
                 Guo},
  title =        {Deep Inter Prediction with Error-Corrected Auto-Regressive
                 Network for Video Coding},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {33:1--33:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3528173},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3528173},
  abstract =     {Modern codecs remove temporal redundancy of a video via
                 inter prediction, i.e., searching previously coded frames
                 for similar blocks and storing motion \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {33},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Li:2023:IIT,
  author =       {Yue Li and Li Zhang and Kai Zhang},
  title =        {{iDAM}: Iteratively Trained Deep In-loop Filter with
                 Adaptive Model Selection},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {34:1--34:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3529107},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3529107},
  abstract =     {As a rapid development of neural-network-based machine
                 learning algorithms, deep learning methods are being
                 tentatively used in a much wider range than \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {34},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Jaiswal:2023:CNN,
  author =       {Rahul Kumar Jaiswal and Rajesh Kumar Dubey},
  title =        {{CAQoE}: a Novel No-Reference Context-aware Speech Quality
                 Prediction Metric},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {35:1--35:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3529394},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3529394},
  abstract =     {The quality of speech degrades while communicating over
                 Voice over Internet Protocol applications, for example,
                 Google Meet, Microsoft Skype, and Apple \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {35},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Xiang:2023:BPP,
  author =       {Tao Xiang and Honghong Zeng and Biwen Chen and Shangwei
                 Guo},
  title =        {{BMIF}: Privacy-preserving Blockchain-based Medical Image
                 Fusion},
  journal =      j-TOMM,
  volume =       {19},
  number =       {1s},
  pages =        {36:1--36:??},
  month =        feb,
  year =         {2023},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3531016},
  ISSN =         {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L =       {1551-6857},
  bibdate =      {Thu Jun 22 10:29:33 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL =          {https://dl.acm.org/doi/10.1145/3531016},
  abstract =     {Medical image fusion generates a fused image containing
                 multiple features extracted from different source images,
                 and it is of great help in clinical \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {36},
  fjournal =     {ACM Transactions on Multimedia Computing, Communications,
                 and Applications},
  journal-URL =  {https://dl.acm.org/loi/tomm},
}

@Article{Zhu:2023:DDB,
  author =       "Xiaoke Zhu and Changlong Li and Xiaopan Chen and Xinyu
                 Zhang and Xiao-Yuan Jing",
  title =        "Distance and Direction Based Deep Discriminant Metric
                 Learning for Kinship Verification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "37:1--37:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3531014",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3531014",
  abstract =     "Image-based kinship verification is an important task
                 in computer vision and has many applications in
                 practice, such as missing children search and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhuang:2023:OPF,
  author =       "Weiming Zhuang and Xin Gan and Yonggang Wen and Shuai
                 Zhang",
  title =        "Optimizing Performance of Federated Person
                 Re-identification: Benchmarking and Analysis",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "38:1--38:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3531013",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3531013",
  abstract =     "Increasingly stringent data privacy regulations limit
                 the development of person re-identification (ReID)
                 because person ReID training requires \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{DeDivitiis:2023:DFF,
  author =       "Lavinia {De Divitiis} and Federico Becattini and
                 Claudio Baecchi and Alberto {Del Bimbo}",
  title =        "Disentangling Features for Fashion Recommendation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "39:1--39:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3531017",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3531017",
  abstract =     "Online stores have become fundamental for the fashion
                 industry, revolving around recommendation systems to
                 suggest appropriate items to customers. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chan:2023:UFH,
  author =       "Ka-Hou Chan and Sio-Kei Im",
  title =        "Using Four Hypothesis Probability Estimators for
                 {CABAC} in Versatile Video Coding",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "40:1--40:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3531015",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3531015",
  abstract =     "This article introduces the key technologies involved
                 in four hypothetical probability estimators for
                 Context-based Adaptive Binary Arithmetic Coding
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yuan:2023:ATD,
  author =       "Mengqi Yuan and Bing-Kun Bao and Zhiyi Tan and
                 Changsheng Xu",
  title =        "Adaptive Text Denoising Network for Image Caption
                 Editing",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "41:1--41:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3532627",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532627",
  abstract =     "Image caption editing, which aims at editing the
                 inaccurate descriptions of the images, is an
                 interdisciplinary task of computer vision and natural
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2023:IQA,
  author =       "Xiaoyu Zhang and Wei Gao and Ge Li and Qiuping Jiang
                 and Runmin Cong",
  title =        "Image Quality Assessment-driven Reinforcement Learning
                 for Mixed Distorted Image Restoration",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "42:1--42:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3532625",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532625",
  abstract =     "Due to the diversity of the degradation process that
                 is difficult to model, the recovery of mixed distorted
                 images is still a challenging problem. The deep
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Bai:2023:DDI,
  author =       "Chongyang Bai and Maksim Bolonkin and Viney Regunath
                 and V. S. Subrahmanian",
  title =        "{DIPS}: a Dyadic Impression Prediction System for
                 Group Interaction Videos",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "43:1--43:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3532865",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532865",
  abstract =     "We consider the problem of predicting the impression
                 that one subject has of another in a video clip showing
                 a group of interacting people. Our novel \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2023:SHL,
  author =       "Yuqing Liu and Xinfeng Zhang and Shanshe Wang and
                 Siwei Ma and Wen Gao",
  title =        "Sequential Hierarchical Learning with Distribution
                 Transformation for Image Super-Resolution",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "44:1--44:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3532864",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532864",
  abstract =     "Multi-scale design has been considered in recent image
                 super-resolution (SR) works to explore the hierarchical
                 feature information. Existing multi-scale \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:JJD,
  author =       "Haidong Wang and Xuan He and Zhiyong Li and Jin Yuan
                 and Shutao Li",
  title =        "{JDAN}: Joint Detection and Association Network for
                 Real-Time Online Multi-Object Tracking",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "45:1--45:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3533253",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3533253",
  abstract =     "In the last few years, enormous strides have been made
                 for object detection and data association, which are
                 vital subtasks for one-stage online multi-object
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xiao:2023:NRD,
  author =       "Mengyao Xiao and Xiaolong Li and Yao Zhao and Bin Ma
                 and Guodong Guo",
  title =        "A Novel Reversible Data Hiding Scheme Based on
                 Pixel-Residual Histogram",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "46:1--46:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3534565",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3534565",
  abstract =     "Prediction-error expansion (PEE) is the most popular
                 reversible data hiding (RDH) technique due to its
                 efficient capacity-distortion tradeoff. With the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2023:MGF,
  author =       "Jiazhi Liu and Feng Liu",
  title =        "Modified {$2$D}-Ghost-Free Stereoscopic Display with
                 Depth-of-Field Effects",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "47:1--47:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3534964",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3534964",
  abstract =     "Backward-compatible stereoscopic display, a novel
                 display technique that can simultaneously present
                 satisfying 3D effects to viewers with stereo \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2023:RAC,
  author =       "Jingwen Chen and Yingwei Pan and Yehao Li and Ting Yao
                 and Hongyang Chao and Tao Mei",
  title =        "Retrieval Augmented Convolutional Encoder-decoder
                 Networks for Video Captioning",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "48:1--48:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3539225",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3539225",
  abstract =     "Video captioning has been an emerging research topic
                 in computer vision, which aims to generate a natural
                 sentence to correctly reflect the visual \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2023:CSA,
  author =       "Guanyu Zhu and Yong Zhou and Rui Yao and Hancheng Zhu
                 and Jiaqi Zhao",
  title =        "Cyclic Self-attention for Point Cloud Recognition",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "49:1--49:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3538648",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3538648",
  abstract =     "Point clouds provide a flexible geometric
                 representation for computer vision research. However,
                 the harsh demands for the number of input points and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2023:EMF,
  author =       "Dinghao Yang and Wei Gao and Ge Li and Hui Yuan and
                 Junhui Hou and Sam Kwong",
  title =        "Exploiting Manifold Feature Representation for
                 Efficient Classification of {$3$D} Point Clouds",
  journal =      j-TOMM,
  volume =       "19",
  number =       "1s",
  pages =        "50:1--50:??",
  month =        feb,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3539611",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:33 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3539611",
  abstract =     "In this paper, we propose an efficient point cloud
                 classification method via manifold learning based
                 feature representation. Different from conventional
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lan:2023:STS,
  author =       "Xiaohan Lan and Yitian Yuan and Xin Wang and Zhi Wang
                 and Wenwu Zhu",
  title =        "A Survey on Temporal Sentence Grounding in Videos",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "51:1--51:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3532626",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3532626",
  abstract =     "Temporal sentence grounding in videos (TSGV), which
                 aims at localizing one target segment from an untrimmed
                 video with respect to a given sentence \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qiao:2023:HPI,
  author =       "Yu Qiao and Yuhao Liu and Ziqi Wei and Yuxin Wang and
                 Qiang Cai and Guofeng Zhang and Xin Yang",
  title =        "Hierarchical and Progressive Image Matting",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "52:1--52:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3540201",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3540201",
  abstract =     "Most matting research resorts to advanced semantics to
                 achieve high-quality alpha mattes, and a direct
                 low-level features combination is usually explored to
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "52",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Peng:2023:LDS,
  author =       "Fei Peng and Wenyan Jiang and Min Long",
  title =        "A Low Distortion and Steganalysis-resistant Reversible
                 Data Hiding for {$2$D} Engineering Graphics",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "53:1--53:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3539661",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3539661",
  abstract =     "To reduce the distortion resulting from the large
                 number of crossing quantization cells and resist
                 steganalysis, a reversible data hiding scheme for 2D
                 engineering \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "53",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Mai:2023:MGU,
  author =       "Sijie Mai and Songlong Xing and Jiaxuan He and Ying
                 Zeng and Haifeng Hu",
  title =        "Multimodal Graph for Unaligned Multimodal Sequence
                 Analysis via Graph Convolution and Graph Pooling",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "54:1--54:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3542927",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3542927",
  abstract =     "Multimodal sequence analysis aims to draw inferences
                 from visual, language, and acoustic sequences. A
                 majority of existing works focus on the aligned fusion
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "54",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zheng:2023:PLN,
  author =       "Qi Zheng and Jianfeng Dong and Xiaoye Qu and Xun Yang
                 and Yabing Wang and Pan Zhou and Baolong Liu and Xun
                 Wang",
  title =        "Progressive Localization Networks for Language-Based
                 Moment Localization",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "55:1--55:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543857",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543857",
  abstract =     "This article targets the task of language-based video
                 moment localization. The language-based setting of this
                 task allows for an open set of target activities,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "55",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2023:LCE,
  author =       "Yue Zhang and Fanghui Zhang and Yi Jin and Yigang Cen
                 and Viacheslav Voronin and Shaohua Wan",
  title =        "Local Correlation Ensemble with {GCN} Based on
                 Attention Features for Cross-domain Person Re-{ID}",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "56:1--56:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3542820",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3542820",
  abstract =     "Person re-identification (Re-ID) has achieved great
                 success in single-domain. However, it remains a
                 challenging task to adapt a Re-ID model trained on
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chakareski:2023:MWF,
  author =       "Jacob Chakareski and Mahmudur Khan and Tanguy
                 Ropitault and Steve Blandino",
  title =        "Millimeter Wave and Free-space-optics for Future
                 Dual-connectivity {6DOF} Mobile Multi-user {VR}
                 Streaming",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "57:1--57:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3544494",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3544494",
  abstract =     "Dual-connectivity streaming is a key enabler of
                 next-generation six Degrees Of Freedom (6DOF) Virtual
                 Reality (VR) scene immersion. Indeed, using
                 conventional sub-6 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "57",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lin:2023:IPG,
  author =       "Yun-Shao Lin and Yi-Ching Liu and Chi-Chun Lee",
  title =        "An Interaction-process-guided Framework for
                 Small-group Performance Prediction",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "58:1--58:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3558768",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3558768",
  abstract =     "A small group is a fundamental interaction unit for
                 achieving a shared goal. Group performance can be
                 automatically predicted using computational methods to
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "58",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zheng:2023:EEA,
  author =       "Na Zheng and Xuemeng Song and Tianyu Su and Weifeng
                 Liu and Yan Yan and Liqiang Nie",
  title =        "Egocentric Early Action Prediction via Adversarial
                 Knowledge Distillation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "59:1--59:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3544493",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3544493",
  abstract =     "Egocentric early action prediction aims to recognize
                 actions from the first-person view by only observing a
                 partial video segment, which is challenging due to the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "59",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:ISR,
  author =       "Li Wang and Ke Li and Jingjing Tang and Yuying Liang",
  title =        "Image Super-Resolution via Lightweight
                 Attention-Directed Feature Aggregation Network",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "60:1--60:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3546076",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3546076",
  abstract =     "The advent of convolutional neural networks (CNNs) has
                 brought substantial progress in image super-resolution
                 (SR) reconstruction. However, most SR methods pursue
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "60",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lin:2023:FAC,
  author =       "Jiaying Lin and Xin Tan and Ke Xu and Lizhuang Ma and
                 Rynson W. H. Lau",
  title =        "Frequency-aware Camouflaged Object Detection",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "61:1--61:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3545609",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3545609",
  abstract =     "Camouflaged object detection (COD) is important as it
                 has various potential applications. Unlike salient
                 object detection (SOD), which tries to identify
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "61",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liang:2023:HNR,
  author =       "Shuang Liang and Anjie Zhu and Jiasheng Zhang and Jie
                 Shao",
  title =        "Hyper-node Relational Graph Attention Network for
                 Multi-modal Knowledge Graph Completion",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "62:1--62:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3545573",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3545573",
  abstract =     "Knowledge graphs often suffer from incompleteness, and
                 knowledge graph completion (KGC) aims at inferring the
                 missing triplets through \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Shi:2023:LVT,
  author          = {Yaya Shi and Haiyang Xu and Chunfeng Yuan and Bing Li
                     and Weiming Hu and Zheng-Jun Zha},
  title           = {Learning Video-Text Aligned Representations for Video
                     Captioning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {63:1--63:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3546828},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3546828},
  abstract        = {Video captioning requires that the model has the
                     abilities of video understanding, video-text alignment,
                     and text generation. Due to the semantic gap between
                     vision \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {63},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Yang:2023:NRQ,
  author          = {Yang Yang and Yingqiu Ding and Ming Cheng and Weiming
                     Zhang},
  title           = {No-reference Quality Assessment for Contrast-distorted
                     Images Based on Gray and Color-gray-difference Space},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {64:1--64:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3555355},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3555355},
  abstract        = {No-reference image quality assessment is a basic and
                     challenging problem in the field of image processing.
                     Among them, contrast distortion has a great impact on
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {64},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wang:2023:REC,
  author          = {Jia Wang and Jingcheng Ke and Hong-Han Shuai and
                     Yung-Hui Li and Wen-Huang Cheng},
  title           = {Referring Expression Comprehension Via Enhanced
                     Cross-modal Graph Attention Networks},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {65:1--65:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3548688},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3548688},
  abstract        = {Referring expression comprehension aims to localize a
                     specific object in an image according to a given
                     language description. It is still challenging to
                     comprehend \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {65},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2023:BLL,
  author          = {Dengyong Zhang and Pu Huang and Xiangling Ding and
                     Feng Li and Wenjie Zhu and Yun Song and Gaobo Yang},
  title           = {{L$^2$BEC$^2$}: Local Lightweight Bidirectional
                     Encoding and Channel Attention Cascade for Video Frame
                     Interpolation},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {66:1--66:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3547660},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3547660},
  abstract        = {Video frame interpolation (VFI) is of great importance
                     for many video applications, yet it is still
                     challenging even in the era of deep learning. Some
                     existing VFI models \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {66},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2023:PBI,
  author          = {Yushu Zhang and Qing Tan and Shuren Qi and Mingfu
                     Xue},
  title           = {{PRNU}-based Image Forgery Localization with Deep
                     Multi-scale Fusion},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {67:1--67:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3548689},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3548689},
  abstract        = {Photo-response non-uniformity (PRNU), as a class of
                     device fingerprint, plays a key role in the forgery
                     detection/localization for visual media. The \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {67},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Dong:2023:SEG,
  author          = {Shanshan Dong and Tianzi Niu and Xin Luo and Wu Liu
                     and Xinshun Xu},
  title           = {Semantic Embedding Guided Attention with Explicit
                     Visual Feature Fusion for Video Captioning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {68:1--68:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3550276},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3550276},
  abstract        = {Video captioning, which bridges vision and language,
                     is a fundamental yet challenging task in computer
                     vision. To generate accurate and comprehensive
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {68},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Xu:2023:SBS,
  author          = {Shunxin Xu and Ke Sun and Dong Liu and Zhiwei Xiong
                     and Zheng-Jun Zha},
  title           = {Synergy between Semantic Segmentation and Image
                     Denoising via Alternate Boosting},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {69:1--69:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3548459},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3548459},
  abstract        = {The capability of image semantic segmentation may be
                     deteriorated due to the noisy input image, where image
                     denoising prior to segmentation \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {69},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Song:2023:SSI,
  author          = {Dan Song and Chu-Meng Zhang and Xiao-Qian Zhao and
                     Teng Wang and Wei-Zhi Nie and Xuan-Ya Li and An-An
                     Liu},
  title           = {Self-supervised Image-based {$3$D} Model Retrieval},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {70:1--70:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3548690},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3548690},
  abstract        = {Image-based 3D model retrieval aims at organizing
                     unlabeled 3D models according to the relevance to the
                     labeled 2D images. With easy accessibility of 2D images
                     and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {70},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Nousias:2023:DSM,
  author          = {Stavros Nousias and Gerasimos Arvanitis and Aris Lalos
                     and Konstantinos Moustakas},
  title           = {Deep Saliency Mapping for {$3$D} Meshes and
                     Applications},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {71:1--71:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3550073},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3550073},
  abstract        = {Nowadays, three-dimensional (3D) meshes are widely
                     used in various applications in different areas (e.g.,
                     industry, education, entertainment and safety). The 3D
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {71},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Liu:2023:TNR,
  author          = {Yun Liu and Xiaohua Yin and Zuliang Wan and Guanghui
                     Yue and Zhi Zheng},
  title           = {Toward A No-reference Omnidirectional Image Quality
                     Evaluation by Using Multi-perceptual Features},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {72:1--72:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3549544},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3549544},
  abstract        = {Compared to ordinary images, omnidirectional image
                     (OI) usually has a broader view and a higher
                     resolution, and image quality assessment (IQA)
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {72},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wu:2023:RIE,
  author          = {Hua Wu and Xin Li and Gang Wang and Guang Cheng and
                     Xiaoyan Hu},
  title           = {Resolution Identification of Encrypted Video Streaming
                     Based on {HTTP/2} Features},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {73:1--73:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3551891},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3551891},
  abstract        = {With the inevitable dominance of video traffic on the
                     Internet, Internet service providers (ISP) are striving
                     to deliver video streaming with high quality. Video
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {73},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@Article{Qin:2023:QEC,
  author =       "Qipu Qin and Cheolkon Jung",
  title =        "Quality Enhancement of Compressed {$360$}-Degree Videos
                 Using Viewport-based Deep Neural Networks",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2",
  pages =        "74:1--74:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3551641",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:34 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3551641",
  abstract =     "360-degree video provides omnidirectional views by a
                 bounding sphere, thus also called omnidirectional
                 video. For omnidirectional video, people can only see
                 specific \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "74",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Zhou:2023:AIS,
  author          = {Wei Zhou and Zhiwu Xia and Peng Dou and Tao Su and
                     Haifeng Hu},
  title           = {Aligning Image Semantics and Label Concepts for Image
                     Multi-Label Classification},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2},
  pages           = {75:1--75:??},
  month           = mar,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3550278},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:34 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3550278},
  abstract        = {Image multi-label classification task is mainly to
                     correctly predict multiple object categories in the
                     images. To capture the correlation between labels,
                     graph \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {75},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Jabeen:2023:RMA,
  author          = {Summaira Jabeen and Xi Li and Muhammad Shoib Amin and
                     Omar Bourahla and Songyuan Li and Abdul Jabbar},
  title           = {A Review on Methods and Applications in Multimodal
                     Deep Learning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {76:1--76:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3545572},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3545572},
  abstract        = {Deep Learning has implemented a wide range of
                     applications and has become increasingly popular in
                     recent years. The goal of multimodal deep learning
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {76},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@Article{Sun:2023:IRG,
  author =       "Sophie C. C. Sun and Yongkang Zhao and Fang-Wei Fu and
                 Yawei Ren",
  title =        "Improved Random Grid-based Cheating Prevention Visual
                 Cryptography Using {Latin} Square",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "77:1--77:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3550275",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3550275",
  abstract =     "Visual cryptography scheme is a method of encrypting
                 secret image into n noiselike shares. The secret image
                 can be reconstructed by stacking \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "77",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Dong:2023:VFI,
  author          = {Jiong Dong and Kaoru Ota and Mianxiong Dong},
  title           = {Video Frame Interpolation: a Comprehensive Survey},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {78:1--78:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3556544},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3556544},
  abstract        = {Video Frame Interpolation (VFI) is a fascinating and
                     challenging problem in the computer vision (CV) field,
                     aiming to generate non-existing frames \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {78},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Cao:2023:DKP,
  author          = {Gaofeng Cao and Fei Zhou and Kanglin Liu and Anjie
                     Wang and Leidong Fan},
  title           = {A Decoupled Kernel Prediction Network Guided by Soft
                     Mask for Single Image {HDR} Reconstruction},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {79:1--79:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3550277},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3550277},
  abstract        = {Recent works on single image high dynamic range (HDR)
                     reconstruction fail to hallucinate plausible textures,
                     resulting in information missing and artifacts
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {79},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Liu:2023:PCQ,
  author          = {Yipeng Liu and Qi Yang and Yiling Xu and Le Yang},
  title           = {Point Cloud Quality Assessment: Dataset Construction
                     and Learning-based No-reference Metric},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {80:1--80:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3550274},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3550274},
  abstract        = {Full-reference (FR) point cloud quality assessment
                     (PCQA) has achieved impressive progress in recent
                     years. However, in many cases, obtaining the reference
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {80},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Xu:2023:PAC,
  author          = {Cheng Xu and Zejun Chen and Jiajie Mai and Xuemiao Xu
                     and Shengfeng He},
  title           = {Pose- and Attribute-consistent Person Image
                     Synthesis},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {81:1--81:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3554739},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3554739},
  abstract        = {Person Image Synthesis aims at transferring the
                     appearance of the source person image into a target
                     pose. Existing methods cannot handle large pose
                     variations and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {81},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Park:2023:SCQ,
  author          = {Jae Hyun Park and Sanghoon Kim and Joo Chan Lee and
                     Jong Hwan Ko},
  title           = {Scalable Color Quantization for Task-centric Image
                     Compression},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {82:1--82:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3551389},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3551389},
  abstract        = {Conventional image compression techniques targeted for
                     the perceptual quality are not generally optimized for
                     classification tasks using deep neural networks
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {82},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Puig:2023:FFP,
  author          = {Joan Manuel Marqu{\`e}s Puig and Helena Rif{\`a}-Pous
                     and Samia Oukemeni},
  title           = {From False-Free to Privacy-Oriented Communitarian
                     Microblogging Social Networks},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {83:1--83:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3555354},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3555354},
  abstract        = {Online Social Networks (OSNs) have gained enormous
                     popularity in recent years. They provide a dynamic
                     platform for sharing content (text messages or \ldots{}
                     ) \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {83},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Tang:2023:QGP,
  author          = {Yiming Tang and Yi Yu},
  title           = {Query-Guided Prototype Learning with Decoder Alignment
                     and Dynamic Fusion in Few-Shot Segmentation},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {84:1--84:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3555314},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3555314},
  abstract        = {Few-shot segmentation aims to segment objects
                     belonging to a specific class under the guidance of a
                     few annotated examples. Most existing approaches follow
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {84},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Liu:2023:MCM,
  author          = {Zhiming Liu and Kai Niu and Zhiqiang He},
  title           = {{ML-CookGAN}: Multi-Label Generative Adversarial
                     Network for Food Image Generation},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {85:1--85:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3554738},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3554738},
  abstract        = {Generating food images from recipe and ingredient
                     information can be applied to many tasks such as food
                     recommendation, recipe development, and health
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {85},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Alwaely:2023:GGB,
  author          = {Basheer Alwaely and Charith Abhayaratne},
  title           = {{GHOSM}: Graph-based Hybrid Outline and Skeleton
                     Modelling for Shape Recognition},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {86:1--86:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3554922},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3554922},
  abstract        = {An efficient and accurate shape detection model plays
                     a major role in many research areas. With the emergence
                     of more complex shapes in real-life applications,
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {86},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Jonna:2023:DDK,
  author          = {Sankaraganesh Jonna and Moushumi Medhi and Rajiv
                     Ranjan Sahay},
  title           = {{Distill-DBDGAN}: Knowledge Distillation and
                     Adversarial Learning Framework for Defocus Blur
                     Detection},
  journal         = j-TOMM,
  volume          = {19},
  number          = {2s},
  pages           = {87:1--87:??},
  month           = apr,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3557897},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:35 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3557897},
  abstract        = {Defocus blur detection (DBD) aims to segment the
                     blurred regions from a given image affected by defocus
                     blur. It is a crucial pre-processing step for various
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {87},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@Article{Ding:2023:BRD,
  author =       "Xuewei Ding and Yingwei Pan and Yehao Li and Ting Yao
                 and Dan Zeng and Tao Mei",
  title =        "Boosting Relationship Detection in Images with
                 Multi-Granular Self-Supervised Learning",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "88:1--88:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3556978",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3556978",
  abstract =     "Visual and spatial relationship detection in images
                 has been a fast-developing research topic in the
                 multimedia field, which learns to recognize the
                 semantic/spatial \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "88",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chu:2023:RLT,
  author =       "Binfei Chu and Yiting Lin and Bineng Zhong and Zhenjun
                 Tang and Xianxian Li and Jing Wang",
  title =        "Robust Long-Term Tracking via Localizing Occluders",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "89:1--89:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3557896",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3557896",
  abstract =     "Occlusion is known as one of the most challenging
                 factors in long-term tracking because of its
                 unpredictable shape. Existing works devoted into the
                 design of \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "89",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2023:CPG,
  author =       "Huisi Wu and Zhaoze Wang and Zhuoying Li and Zhenkun
                 Wen and Jing Qin",
  title =        "Context Prior Guided Semantic Modeling for Biomedical
                 Image Segmentation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "90:1--90:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3558520",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3558520",
  abstract =     "Most state-of-the-art deep networks proposed for
                 biomedical image segmentation are developed based on
                 U-Net. While remarkable success has been \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "90",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2023:OBM,
  author =       "Jun Wu and Tianliang Zhu and Jiahui Zhu and Tianyi Li
                 and Chunzhi Wang",
  title =        "A Optimized {BERT} for Multimodal Sentiment Analysis",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "91:1--91:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3566126",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3566126",
  abstract =     "Sentiment analysis of one modality (e.g., text or
                 image) has been broadly studied. However, not much
                 attention has been paid to the sentiment analysis of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "91",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2023:PTM,
  author =       "Yongzong Xu and Zhijing Yang and Tianshui Chen and Kai
                 Li and Chunmei Qing",
  title =        "Progressive Transformer Machine for Natural Character
                 Reenactment",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "92:1--92:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3559107",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3559107",
  abstract =     "Character reenactment aims to control a target
                 person's full-head movement by a driving monocular
                 sequence that is made up of the driving character
                 video. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "92",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tan:2023:IVV,
  author =       "Chong Hong Tan and Koksheik Wong and Vishnu Monn
                 Baskaran and Kiki Adhinugraha and David Taniar",
  title =        "Is it Violin or {Viola}? {Classifying} the
                 Instruments' Music Pieces using Descriptive
                 Statistics",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "93:1--93:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3563218",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3563218",
  abstract =     "Classifying music pieces based on their instrument
                 sounds is pivotal for analysis and application
                 purposes. Given its importance, techniques using
                 machine learning \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "93",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Singh:2023:ESM,
  author =       "K. N. Singh and O. P. Singh and Amit Kumar Singh and
                 Amrit Kumar Agrawal",
  title =        "{EiMOL}: a Secure Medical Image Encryption Algorithm
                 based on Optimization and the {Lorenz} System",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "94:1--94:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3561513",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3561513",
  abstract =     "Nowadays, the demand for digital images from different
                 intelligent devices and sensors has dramatically
                 increased in smart healthcare. Due to advanced
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "94",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qiao:2023:UUE,
  author =       "Ziteng Qiao and Dianxi Shi and Xiaodong Yi and Yanyan
                 Shi and Yuhui Zhang and Yangyang Liu",
  title =        "{UEFPN}: Unified and Enhanced Feature Pyramid Networks
                 for Small Object Detection",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "95:1--95:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3561824",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3561824",
  abstract =     "Object detection models based on feature pyramid
                 networks have made significant progress in general
                 object detection. However, small object detection is
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "95",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2023:DLB,
  author =       "Linwei Zhu and Yun Zhang and Na Li and Gangyi Jiang
                 and Sam Kwong",
  title =        "Deep Learning-Based Intra Mode Derivation for
                 Versatile Video Coding",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "96:1--96:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3563699",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3563699",
  abstract =     "In intra coding, Rate Distortion Optimization (RDO) is
                 performed to achieve the optimal intra mode from a
                 pre-defined candidate list. The optimal intra mode is
                 also \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "96",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zeng:2023:LEI,
  author =       "Donghuo Zeng and Jianming Wu and Gen Hattori and Rong
                 Xu and Yi Yu",
  title =        "Learning Explicit and Implicit Dual Common Subspaces
                 for Audio-visual Cross-modal Retrieval",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "97:1--97:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3564608",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3564608",
  abstract =     "Audio-visual tracks in video contain rich semantic
                 information with potential in many applications and
                 research. Since the audio-visual data have inconsistent
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "97",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gao:2023:RTI,
  author =       "Qiqi Gao and Jie Li and Tiejun Zhao and Yadong Wang",
  title =        "Real-time Image Enhancement with Attention
                 Aggregation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "98:1--98:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3564607",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3564607",
  abstract =     "Image enhancement has stimulated significant research
                 works over the past years for its great application
                 potential in video conferencing scenarios. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "98",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2023:TVB,
  author =       "Yucheng Zhu and Xiongkuo Min and Dandan Zhu and
                 Guangtao Zhai and Xiaokang Yang and Wenjun Zhang and Ke
                 Gu and Jiantao Zhou",
  title =        "Toward Visual Behavior and Attention Understanding for
                 Augmented 360 Degree Videos",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "99:1--99:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565024",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565024",
  abstract =     "Augmented reality (AR) overlays digital content onto
                 reality. In an AR system, correct and precise
                 estimations of user visual fixations and head movements
                 can \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "99",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Mei:2023:MSS,
  author =       "Haiyang Mei and Letian Yu and Ke Xu and Yang Wang and
                 Xin Yang and Xiaopeng Wei and Rynson W. H. Lau",
  title =        "Mirror Segmentation via Semantic-aware Contextual
                 Contrasted Feature Learning",
  journal =      j-TOMM,
  volume =       "19",
  number =       "2s",
  pages =        "100:1--100:??",
  month =        apr,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3566127",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:35 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3566127",
  abstract =     "Mirrors are everywhere in our daily lives. Existing
                 computer vision systems do not consider mirrors, and
                 hence may get confused by the reflected content
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "100",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2023:PSN,
  author =       "Yi Zhang and Fang-Yi Chao and Wassim Hamidouche and
                 Olivier D{\'e}forges",
  title =        "{PAV-SOD}: a New Task towards Panoramic Audiovisual
                 Saliency Detection",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "101:1--101:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565267",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565267",
  abstract =     "Object-level audiovisual saliency detection in
                 360${}^\circ $ panoramic real-life dynamic scenes is
                 important for exploring and modeling human perception
                 in immersive \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "101",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xie:2023:TDW,
  author =       "Chi Xie and Zikun Zhuang and Shengjie Zhao and Shuang
                 Liang",
  title =        "Temporal Dropout for Weakly Supervised Action
                 Localization",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "102:1--102:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567827",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567827",
  abstract =     "Weakly supervised action localization is a challenging
                 problem in video understanding and action recognition.
                 Existing models usually formulate the training
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "102",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Guo:2023:MBR,
  author =       "Yangyang Guo and Liqiang Nie and Harry Cheng and
                 Zhiyong Cheng and Mohan Kankanhalli and Alberto {Del
                 Bimbo}",
  title =        "On Modality Bias Recognition and Reduction",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "103:1--103:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565266",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565266",
  abstract =     "Making each modality in multi-modal data contribute is
                 of vital importance to learning a versatile multi-modal
                 model. Existing methods, however, are often \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "103",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2023:CTC,
  author =       "Kang Xu and Weixin Li and Xia Wang and Xiaoyan Hu and
                 Ke Yan and Xiaojie Wang and Xuan Dong",
  title =        "{CUR} Transformer: a Convolutional Unbiased Regional
                 Transformer for Image Denoising",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "104:1--104:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3566125",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3566125",
  abstract =     "Image denoising is a fundamental problem in computer
                 vision and multimedia computation. Non-local filters
                 are effective for image denoising. But existing
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "104",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2023:BPL,
  author =       "Wenxin Huang and Xuemei Jia and Xian Zhong and Xiao
                 Wang and Kui Jiang and Zheng Wang",
  title =        "Beyond the Parts: Learning Coarse-to-Fine Adaptive
                 Alignment Representation for Person Search",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "105:1--105:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565886",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565886",
  abstract =     "Person search is a time-consuming computer vision task
                 that entails locating and recognizing query people in
                 scenic pictures. Body components are \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "105",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yu:2023:DAP,
  author =       "Hongchuan Yu and Mengqing Huang and Jian Jun Zhang",
  title =        "Domain Adaptation Problem in Sketch Based Image
                 Retrieval",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "106:1--106:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565368",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565368",
  abstract =     "In this article, we present two algorithms that
                 discover the discriminative structures of sketches,
                 given pairs of sketches and photos in sketch-based
                 image retrieval \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "106",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yan:2023:TIF,
  author =       "Han Yan and Haijun Zhang and Jianyang Shi and
                 Jianghong Ma and Xiaofei Xu",
  title =        "Toward Intelligent Fashion Design: a Texture and Shape
                 Disentangled Generative Adversarial Network",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "107:1--107:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567596",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567596",
  abstract =     "Texture and shape in fashion, constituting essential
                 elements of garments, characterize the body and surface
                 of the fabric and outline the silhouette of clothing,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "107",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Dou:2023:MTP,
  author =       "Peng Dou and Ying Zeng and Zhuoqun Wang and Haifeng
                 Hu",
  title =        "Multiple Temporal Pooling Mechanisms for Weakly
                 Supervised Temporal Action Localization",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "108:1--108:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3567828",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3567828",
  abstract =     "Recent action localization works learn in a weakly
                 supervised manner to avoid the expensive cost of human
                 labeling. Those works are mostly based on the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "108",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:MSE,
  author =       "Lei Li and Zhiyuan Zhou and Suping Wu and Yongrong
                 Cao",
  title =        "Multi-scale Edge-guided Learning for {$3$D}
                 Reconstruction",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "109:1--109:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3568678",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3568678",
  abstract =     "Single-view three-dimensional (3D) object
                 reconstruction has always been a long-term challenging
                 task. Objects with complex topologies are hard to
                 accurately \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "109",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:LFR,
  author =       "Zhengxue Wang and Guangwei Gao and Juncheng Li and Hui
                 Yan and Hao Zheng and Huimin Lu",
  title =        "Lightweight Feature De-redundancy and Self-calibration
                 Network for Efficient Image Super-resolution",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "110:1--110:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569900",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569900",
  abstract =     "In recent years, thanks to the inherent powerful
                 feature representation and learning abilities of the
                 convolutional neural network (CNN), deep CNN-steered
                 single \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "110",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2023:FTF,
  author =       "Zhijie Huang and Jun Sun and Xiaopeng Guo",
  title =        "{FastCNN}: Towards Fast and Accurate Spatiotemporal
                 Network for {HEVC} Compressed Video Enhancement",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "111:1--111:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569583",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569583",
  abstract =     "Deep neural networks have achieved remarkable success
                 in HEVC compressed video quality enhancement. However,
                 most existing multiframe-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "111",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:DPS,
  author =       "Xiaohan Wang and Linchao Zhu and Fei Wu and Yi Yang",
  title =        "A Differentiable Parallel Sampler for Efficient Video
                 Classification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3",
  pages =        "112:1--112:??",
  month =        may,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569584",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569584",
  abstract =     "It is crucial to sample a small portion of relevant
                 frames for efficient video classification. The existing
                 methods mainly develop hand-designed sampling
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "112",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:TFE,
  author          = {Junjie Li and Jin Yuan and Zhiyong Li},
  title           = {{TP-FER}: an Effective Three-phase Noise-tolerant
                     Recognizer for Facial Expression Recognition},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {113:1--113:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3570329},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3570329},
  abstract        = {Single-label facial expression recognition (FER),
                     which aims to classify single expression for facial
                     images, usually suffers from the label noisy and
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {113},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Huang:2023:LEF,
  author          = {Baojin Huang and Zhongyuan Wang and Guangcheng Wang
                     and Zhen Han and Kui Jiang},
  title           = {Local Eyebrow Feature Attention Network for Masked
                     Face Recognition},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {114:1--114:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3569943},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3569943},
  abstract        = {During the COVID-19 coronavirus epidemic, wearing
                     masks has become increasingly popular. Traditional
                     occlusion face recognition algorithms are almost
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {114},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Yang:2023:ESI,
  author          = {Bin-Cheng Yang and Gangshan Wu},
  title           = {Efficient Single-image Super-resolution Using Dual
                     path Connections with Multiple scale Learning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {115:1--115:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3570164},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3570164},
  abstract        = {Deep convolutional neural networks have been
                     demonstrated to be effective for single-image
                     super-resolution in recent years. On the one hand,
                     residual \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {115},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhou:2023:AAM,
  author          = {Wei Zhou and Yanke Hou and Dihu Chen and Haifeng Hu
                     and Tao Su},
  title           = {Attention-Augmented Memory Network for Image
                     Multi-Label Classification},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {116:1--116:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3570166},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3570166},
  abstract        = {The purpose of image multi-label classification is to
                     predict all the object categories presented in an
                     image. Some recent works exploit graph \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {116},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Hui:2023:MGC,
  author          = {Shuaixiong Hui and Qiang Guo and Xiaoyu Geng and
                     Caiming Zhang},
  title           = {Multi-Guidance {CNNs} for Salient Object Detection},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {117:1--117:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3570507},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3570507},
  abstract        = {Feature refinement and feature fusion are two key
                     steps in convolutional neural networks-based salient
                     object detection (SOD). In this article, we \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {117},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Xing:2023:PPI,
  author          = {Kai Xing and Tao Li and Xuanhan Wang},
  title           = {{ProposalVLAD} with Proposal-Intra Exploring for
                     Temporal Action Proposal Generation},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {118:1--118:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571747},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571747},
  abstract        = {Temporal action proposal generation aims to localize
                     temporal segments of human activities in videos.
                     Current boundary-based proposal generation methods can
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {118},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Tang:2023:DUK,
  author          = {Hao Tang and Lei Ding and Songsong Wu and Bin Ren and
                     Nicu Sebe and Paolo Rota},
  title           = {Deep Unsupervised Key Frame Extraction for Efficient
                     Video Classification},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {119:1--119:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571735},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571735},
  abstract        = {Video processing and analysis have become an urgent
                     task, as a huge amount of videos (e.g., YouTube, Hulu)
                     are uploaded online every day. The extraction of
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {119},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2023:ERI,
  author          = {Ling Zhang and Chengjiang Long and Xiaolong Zhang and
                     Chunxia Xiao},
  title           = {Exploiting Residual and Illumination with {GANs} for
                     Shadow Detection and Shadow Removal},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {120:1--120:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571745},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571745},
  abstract        = {Residual image and illumination estimation have been
                     proven to be helpful for image enhancement. In this
                     article, we propose a general framework, called RI-GAN,
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {120},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhang:2023:DRI,
  author          = {Yushu Zhang and Nuo Chen and Shuren Qi and Mingfu Xue
                     and Zhongyun Hua},
  title           = {Detection of Recolored Image by Texture Features in
                     Chrominance Components},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {121:1--121:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571076},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571076},
  abstract        = {Image recoloring is an emerging editing technique that
                     can change the color style of an image by modifying
                     pixel values without altering the original image
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {121},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Xue:2023:HFF,
  author          = {Han Xue and Jun Ling and Anni Tang and Li Song and
                     Rong Xie and Wenjun Zhang},
  title           = {High-Fidelity Face Reenactment Via Identity-Matched
                     Correspondence Learning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {122:1--122:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571857},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571857},
  abstract        = {Face reenactment aims to generate an animation of a
                     source face using the poses and expressions from a
                     target face. Although recent methods have made
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {122},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Chen:2023:PHD,
  author          = {Haozhe Chen and Hang Zhou and Jie Zhang and Dongdong
                     Chen and Weiming Zhang and Kejiang Chen and Gang Hua
                     and Nenghai Yu},
  title           = {Perceptual Hashing of Deep Convolutional Neural
                     Networks for Model Copy Detection},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {123:1--123:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3572777},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/hash.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3572777},
  abstract        = {In recent years, many model intellectual property (IP)
                     proof methods for IP protection have been proposed,
                     such as model watermarking and model \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {123},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Duan:2023:MGL,
  author          = {Wei Duan and Yi Yu and Xulong Zhang and Suhua Tang and
                     Wei Li and Keizo Oyama},
  title           = {Melody Generation from Lyrics with Local
                     Interpretability},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {124:1--124:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3572031},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3572031},
  abstract        = {Melody generation aims to learn the distribution of
                     real melodies to generate new melodies conditioned on
                     lyrics, which has been a very interesting topic in the
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {124},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2023:TFG,
  author          = {Shiguang Liu and Huixin Wang},
  title           = {Talking Face Generation via Facial Anatomy},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3},
  pages           = {125:1--125:??},
  month           = may,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571746},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:36 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571746},
  abstract        = {To generate the corresponding talking face from a
                     speech audio and a face image, it is essential to match
                     the variations in the facial appearance with the speech
                     audio \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {125},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zeng:2023:TIA,
  author          = {Zengri Zeng and Baokang Zhao and Han-Chieh Chao and
                     Ilsun You and Kuo-Hui Yeh and Weizhi Meng},
  title           = {Towards Intelligent Attack Detection Using {DNA}
                     Computing},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {126:1--126:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3561057},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3561057},
  abstract        = {In recent years, frequent network attacks have
                     seriously threatened the interests and security of
                     humankind. To address this threat, many detection
                     methods \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {126},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Wang:2023:DCB,
  author          = {Jinxia Wang and Rui Chen and Zhihan Lv},
  title           = {{DNA} Computing-Based Multi-Source Data Storage Model
                     in Digital Twins},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {127:1--127:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3561823},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3561823},
  abstract        = {The work aims to study the application of
                     Deoxyribonucleic Acid (DNA) multi-source data storage
                     in Digital Twins (DT). Through the investigation of the
                     research \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {127},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Ahmed:2023:DBC,
  author          = {Fawad Ahmed and Muneeb Ur Rehman and Jawad Ahmad and
                     Muhammad Shahbaz Khan and Wadii Boulila and Gautam
                     Srivastava and Jerry Chun-Wei Lin and William J.
                     Buchanan},
  title           = {A {DNA} Based Colour Image Encryption Scheme Using A
                     Convolutional Autoencoder},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {128:1--128:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3570165},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3570165},
  abstract        = {With the advancement in technology, digital images can
                     easily be transmitted and stored over the Internet.
                     Encryption is used to avoid illegal interception of
                     digital \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {128},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Menon:2023:EEM,
  author =       "Vignesh V. Menon and Hadi Amirpour and Mohammad
                 Ghanbari and Christian Timmerer",
  title =        "{EMES}: Efficient Multi-encoding Schemes for
                 {HEVC}-based Adaptive Bitrate Streaming",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3s",
  pages =        "129:1--129:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3575659",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3575659",
  abstract =     "In HTTP Adaptive Streaming (HAS), videos are encoded
                 at multiple bitrates and spatial resolutions (i.e.,
                 representations) to adapt to the heterogeneity of
                 network \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "129",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2023:VAC,
  author          = {Jiwei Zhang and Yi Yu and Suhua Tang and Jianming Wu
                     and Wei Li},
  title           = {Variational Autoencoder with {CCA} for Audio-Visual
                     Cross-modal Retrieval},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {130:1--130:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3575658},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3575658},
  abstract        = {Cross-modal retrieval is to utilize one modality as a
                     query to retrieve data from another modality, which has
                     become a popular topic in information \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {130},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Le:2023:SAV,
  author          = {Thi-Ngoc-Hanh Le and Ya-Hsuan Chen and Tong-Yee Lee},
  title           = {Structure-aware Video Style Transfer with Map Art},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {131:1--131:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3572030},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3572030},
  abstract        = {Changing the style of an image/video while preserving
                     its content is a crucial criterion to access a new
                     neural style transfer algorithm. However, it is very
                     challenging to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {131},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zhao:2023:PMT,
  author          = {Sirui Zhao and Hongyu Jiang and Hanqing Tao and Rui
                     Zha and Kun Zhang and Tong Xu and Enhong Chen},
  title           = {{PEDM}: a Multi-task Learning Model for Persona-aware
                     Emoji-embedded Dialogue Generation},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {132:1--132:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3571819},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3571819},
  abstract        = {As a vivid and linguistic symbol, Emojis have become a
                     prevailing medium interspersed in text-based
                     communication (e.g., social media and chit-chat) to
                     express \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {132},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

%%% NOTE(review): the citation label below begins with "Hung", but the
%%% family name of the first author in the author field is "Huang".  The
%%% label is left unchanged so that existing citations keep resolving;
%%% confirm whether it should instead be Huang:2023:FCN.
@Article{Hung:2023:FCN,
  author =       "Heyu Huang and Runmin Cong and Lianhe Yang and Ling Du
                 and Cong Wang and Sam Kwong",
  title =        "Feedback Chain Network for Hippocampus Segmentation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "3s",
  pages =        "133:1--133:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3571744",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3571744",
  abstract =     "The hippocampus plays a vital role in the diagnosis
                 and treatment of many neurological disorders. Recent
                 years, deep learning technology has made \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "133",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yao:2023:CRA,
  author          = {Xuanrong Yao and Xin Wang and Yue Liu and Wenwu Zhu},
  title           = {Continual Recognition with Adaptive Memory Update},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {134:1--134:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3573202},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3573202},
  abstract        = {Class incremental continual learning aims to improve
                     the ability of modern classification models to
                     continually recognize new classes without forgetting
                     the previous \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {134},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Wang:2023:AAM,
  author          = {Jingyao Wang and Luntian Mou and Lei Ma and Tiejun
                     Huang and Wen Gao},
  title           = {{AMSA}: Adaptive Multimodal Learning for Sentiment
                     Analysis},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {135:1--135:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3572915},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3572915},
  abstract        = {Efficient recognition of emotions has attracted
                     extensive research interest, which makes new
                     applications in many fields possible, such as
                     human-computer \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {135},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Zeng:2023:JAC,
  author          = {Shaoning Zeng and Yunbo Rao and Bob Zhang and Yong
                     Xu},
  title           = {Joint Augmented and Compressed Dictionaries for Robust
                     Image Classification},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {136:1--136:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3572910},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3572910},
  abstract        = {Dictionary-based Classification (DC) has been a
                     promising learning theory in multimedia computing.
                     Previous studies focused on learning a \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {136},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Wanyan:2023:DSG,
  author          = {Yuyang Wanyan and Xiaoshan Yang and Xuan Ma and
                     Changsheng Xu},
  title           = {Dual Scene Graph Convolutional Network for Motivation
                     Prediction},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {137:1--137:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3572914},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3572914},
  abstract        = {Humans can easily infer the motivations behind human
                     actions from only visual data by comprehensively
                     analyzing the complex context information and utilizing
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {137},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Lei:2023:LUD,
  author          = {Fei Lei and Zhongqi Cao and Yuning Yang and Yibo Ding
                     and Cong Zhang},
  title           = {Learning the User's Deeper Preferences for Multi-modal
                     Recommendation Systems},
  journal         = j-TOMM,
  volume          = {19},
  number          = {3s},
  pages           = {138:1--138:??},
  month           = jun,
  year            = {2023},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3573010},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Thu Jun 22 10:29:37 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3573010},
  abstract        = {Recommendation system plays an important role in the
                     rapid development of micro-video sharing platform.
                     Micro-video has rich modal features, such as visual,
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {138},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Yan:2023:FDP,
  author =       "Xuehu Yan and Longlong Li and Lei Sun and Jia Chen and
                 Shudong Wang",
  title =        "Fake and Dishonest Participant Immune Secret Image
                 Sharing",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "139:1--139:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572842",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572842",
  abstract =     "Secret image sharing (SIS) has received increased
                 attention from the research community because of its
                 usefulness in multiparty secure computing, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "139",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2023:SCF,
  author =       "Song Yang and Qiang Li and Wenhui Li and Xuan-Ya Li
                 and Ran Jin and Bo Lv and Rui Wang and Anan Liu",
  title =        "Semantic Completion and Filtration for Image-Text
                 Retrieval",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "140:1--140:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572844",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572844",
  abstract =     "Image-text retrieval is a vital task in computer
                 vision and has received growing attention, since it
                 connects cross-modality data. It comes with the
                 critical \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "140",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ma:2023:MSK,
  author =       "Xuan Ma and Xiaoshan Yang and Changsheng Xu",
  title =        "Multi-Source Knowledge Reasoning Graph Network for
                 Multi-Modal Commonsense Inference",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "141:1--141:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3573201",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3573201",
  abstract =     "As a crucial part of natural language processing,
                 event-centered commonsense inference task has attracted
                 increasing attention. With a given observed \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "141",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2023:APA,
  author =       "Shangxi Wu and Jitao Sang and Kaiyuan Xu and Jiaming
                 Zhang and Jian Yu",
  title =        "Attention, Please! {Adversarial} Defense via
                 Activation Rectification and Preservation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "142:1--142:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572843",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572843",
  abstract =     "This study provides a new understanding of the
                 adversarial attack problem by examining the correlation
                 between adversarial attack and visual attention change.
                 In \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "142",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:CSA,
  author =       "Kan Wang and Changxing Ding and Jianxin Pang and
                 Xiangmin Xu",
  title =        "Context Sensing Attention Network for Video-based
                 Person Re-identification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "143:1--143:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3573203",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3573203",
  abstract =     "Video-based person re-identification (ReID) is
                 challenging due to the presence of various
                 interferences in video frames. Recent approaches handle
                 this problem \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "143",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:SSL,
  author =       "Wenjing Wang and Lilang Lin and Zejia Fan and Jiaying
                 Liu",
  title =        "Semi-supervised Learning for {Mars} Imagery
                 Classification and Segmentation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "144:1--144:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572916",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572916",
  abstract =     "With the progress of Mars exploration, numerous Mars
                 image data are being collected and need to be analyzed.
                 However, due to the severe train-test gap \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "144",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2023:DDD,
  author =       "Hui Liu and Shanshan Li and Jicheng Zhu and Kai Deng
                 and Meng Liu and Liqiang Nie",
  title =        "{DDIFN}: a Dual-discriminator Multi-modal Medical
                 Image Fusion Network",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "145:1--145:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3574136",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3574136",
  abstract =     "Multi-modal medical image fusion is a long-standing
                 important research topic that can obtain informative
                 medical images and assist doctors diagnose and treat
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "145",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wu:2023:DGD,
  author =       "Xintian Wu and Huanyu Wang and Yiming Wu and Xi Li",
  title =        "{D$^3$T-GAN}: Data-Dependent Domain Transfer {GANs}
                 for Image Generation with Limited Data",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "146:1--146:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3576858",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3576858",
  abstract =     "As an important and challenging problem, image
                 generation with limited data aims at generating
                 realistic images through training a GAN model given few
                 samples. A \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "146",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2023:NLA,
  author =       "Dandan Zhu and Xuan Shao and Qiangqiang Zhou and
                 Xiongkuo Min and Guangtao Zhai and Xiaokang Yang",
  title =        "A Novel Lightweight Audio-visual Saliency Model for
                 Videos",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "147:1--147:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3576857",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3576857",
  abstract =     "Audio information has not been considered an important
                 factor in visual attention models regardless of many
                 psychological studies that have shown the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "147",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Abdussalam:2023:NNC,
  author =       "Amr Abdussalam and Zhongfu Ye and Ammar Hawbani and
                 Majjed Al-Qatf and Rashid Khan",
  title =        "{NumCap}: a Number-controlled Multi-caption Image
                 Captioning Network",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "148:1--148:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3576927",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3576927",
  abstract =     "Image captioning is a promising task that attracted
                 researchers in the last few years. Existing image
                 captioning models are primarily trained to generate one
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "148",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2023:DML,
  author =       "Hao Liu and Zhaoyu Yan and Bing Liu and Jiaqi Zhao and
                 Yong Zhou and Abdulmotaleb {El Saddik}",
  title =        "Distilled Meta-learning for Multi-Class Incremental
                 Learning",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "149:1--149:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3576045",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3576045",
  abstract =     "Meta-learning approaches have recently achieved
                 promising performance in multi-class incremental
                 learning. However, meta-learners still suffer from
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "149",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yuan:2023:GAT,
  author =       "Jin Yuan and Shikai Chen and Yao Zhang and Zhongchao
                 Shi and Xin Geng and Jianping Fan and Yong Rui",
  title =        "Graph Attention Transformer Network for Multi-label
                 Image Classification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "150:1--150:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3578518",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3578518",
  abstract =     "Multi-label classification aims to recognize multiple
                 objects or attributes from images. The key to solving
                 this issue relies on effectively characterizing the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "150",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hou:2023:UUI,
  author =       "Guojia Hou and Yuxuan Li and Huan Yang and Kunqian Li
                 and Zhenkuan Pan",
  title =        "{UID2021}: an Underwater Image Dataset for Evaluation
                 of No-Reference Quality Assessment Metrics",
  journal =      j-TOMM,
  volume =       "19",
  number =       "4",
  pages =        "151:1--151:??",
  month =        jul,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3578584",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Jun 22 10:29:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3578584",
  abstract =     "Achieving subjective and objective quality assessment
                 of underwater images is of high significance in
                 underwater visual perception and image/video
                 processing. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "151",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Carlsson:2023:CUS,
  author =       "Niklas Carlsson and Derek Eager",
  title =        "Cross-User Similarities in Viewing Behavior for
                 360${}^\circ $ Video and Caching Implications",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "152:1--152:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3507917",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3507917",
  abstract =     "The demand and usage of 360${}^\circ $ video services
                 are expected to increase. However, despite these
                 services being highly bandwidth intensive, not much is
                 known about the potential value that basic bandwidth
                 saving techniques such as server or edge-network
                 on-demand \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "152",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:EEH,
  author =       "Ziqiang Li and Pengfei Xia and Xue Rui and Bin Li",
  title =        "Exploring the Effect of High-frequency Components in
                 {GANs} Training",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "153:1--153:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3578585",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3578585",
  abstract =     "Generative Adversarial Networks (GANs) have the
                 ability to generate images that are visually
                 indistinguishable from real images. However, recent
                 studies have revealed that generated and real images
                 share significant differences in the frequency domain.
                 In \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "153",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yin:2023:FFM,
  author =       "Haibing Yin and Hongkui Wang and Li Yu and Junhui
                 Liang and Guangtao Zhai",
  title =        "Feedforward and Feedback Modulations Based Foveated
                 {JND} Estimation for Images",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "154:1--154:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579094",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579094",
  abstract =     "The just noticeable difference (JND) reveals the key
                 characteristic of visual perception, which has been
                 widely used in many perception-based image and video
                 applications. Nevertheless, the modulatory mechanism of
                 the human visual system (HVS) has not \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "154",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2023:MID,
  author =       "Taocun Yang and Yaping Huang and Yanlin Xie and Junbo
                 Liu and Shengchun Wang",
  title =        "{MixOOD}: Improving Out-of-distribution Detection with
                 Enhanced Data Mixup",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "155:1--155:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3578935",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3578935",
  abstract =     "Detecting out-of-distribution (OOD) inputs for deep
                 learning models is a critical task when models are
                 deployed in real-world environments. Recently, a large
                 number of works have been dedicated to tackling the OOD
                 detection problem. One of the most \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "155",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wei:2023:MLC,
  author =       "Hao Wei and Rui Chen",
  title =        "A Multi-Level Consistency Network for High-Fidelity
                 Virtual Try-On",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "156:1--156:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580500",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580500",
  abstract =     "The 2D virtual try-on task aims to transfer a target
                 clothing image to the corresponding region of a person
                 image. Although an extensive amount of research has
                 been conducted due to its immense applications, this
                 task still remains a great challenge to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "156",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hao:2023:FGT,
  author =       "Jiachang Hao and Haifeng Sun and Pengfei Ren and
                 Yiming Zhong and Jingyu Wang and Qi Qi and Jianxin
                 Liao",
  title =        "Fine-Grained Text-to-Video Temporal Grounding from
                 Coarse Boundary",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "157:1--157:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579825",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579825",
  abstract =     "Text-to-video temporal grounding aims to locate a
                 target video moment that semantically corresponds to
                 the given sentence query in an untrimmed video. In this
                 task, fully supervised works require text descriptions
                 for each event along with its temporal \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "157",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:DLH,
  author =       "Weixin Li and Tiantian Cao and Chang Liu and Xue Tian
                 and Ya Li and Xiaojie Wang and Xuan Dong",
  title =        "Dual-Lens {HDR} using Guided {$3$D} Exposure {CNN} and
                 Guided Denoising Transformer",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "158:1--158:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579167",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579167",
  abstract =     "We study the high dynamic range (HDR) imaging problem
                 in dual-lens systems. Existing methods usually treat
                 the HDR imaging problem as an image fusion problem and
                 the HDR result is estimated by fusing the aligned short
                 exposure image and long exposure \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "158",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2023:HHF,
  author =       "Xin Yang and Hengrui Li and Xiaochuan Li and Tao Li",
  title =        "{HIFGAN}: a High-Frequency Information-Based
                 Generative Adversarial Network for Image
                 Super-Resolution",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "159:1--159:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3578934",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3578934",
  abstract =     "Since the neural network was introduced into the
                 super-resolution (SR) field, many SR deep models have
                 been proposed and have achieved excellent results.
                 However, there are two main drawbacks: one is that the
                 methods based on the best peak-signal-to-noise
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "159",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:DMO,
  author =       "Yang Li",
  title =        "Detection of Moving Object Using Superpixel Fusion
                 Network",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "160:1--160:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579998",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579998",
  abstract =     "Moving object detection is still a challenging task in
                 complex scenes. The existing methods based on deep
                 learning mainly use U-Nets and have achieved amazing
                 results. However, they ignore the local continuity
                 between pixels. In order to solve this \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "160",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Pan:2023:BTO,
  author =       "Yingwei Pan and Yehao Li and Ting Yao and Tao Mei",
  title =        "Bottom-up and Top-down Object Inference Networks for
                 Image Captioning",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "161:1--161:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580366",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580366",
  abstract =     "A bottom-up and top-down attention mechanism has led
                 to the revolutionizing of image captioning techniques,
                 which enables object-level attention for multi-step
                 reasoning over all the detected objects. However, when
                 humans describe an image, they often \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "161",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Feng:2023:MMK,
  author =       "Duoduo Feng and Xiangteng He and Yuxin Peng",
  title =        "{MKVSE}: Multimodal Knowledge Enhanced Visual-semantic
                 Embedding for Image-text Retrieval",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "162:1--162:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580501",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580501",
  abstract =     "Image-text retrieval aims to take the text (image)
                 query to retrieve the semantically relevant images
                 (texts), which is fundamental and critical in the
                 search system, online shopping, and social network.
                 Existing works have shown the effectiveness of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "162",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2023:BTG,
  author =       "Mengyi Zhao and Hao Tang and Pan Xie and Shuling Dai
                 and Nicu Sebe and Wei Wang",
  title =        "Bidirectional Transformer {GAN} for Long-term Human
                 Motion Prediction",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5",
  pages =        "163:1--163:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579359",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 07:03:55 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579359",
  abstract =     "The mainstream motion prediction methods usually focus
                 on short-term prediction, and their predicted long-term
                 motions often fall into an average pose, i.e., the
                 freezing forecasting problem [27]. To mitigate this
                 problem, we propose a novel Bidirectional \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "163",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Wang:2023:RVS,
  author          = {Jian Wang and Qiang Ling and Peiyan Li},
  title           = {Robust Video Stabilization based on Motion
                     Decomposition},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5},
  pages           = {164:1--164:??},
  month           = sep,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3580498},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 07:03:55 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3580498},
  abstract        = {Video stabilization aims to eliminate camera jitter
                     and improve the visual experience of shaky videos.
                     Video stabilization methods often ignore the active
                     movement of the foreground objects and the camera, and
                     may result in distortion and over-smoothing \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {164},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Franti:2023:DPC,
  author          = {Pasi Fr{\"a}nti and Nancy Fazal},
  title           = {Design Principles for Content Creation in
                     Location-Based Games},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {165:1--165:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3583689},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3583689},
  abstract        = {Location-based games have been around since 2000
                     across various fields, including education, health, and
                     entertainment. The main challenge facing such games
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {165},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2023:VNB,
  author          = {Chenchi Zhang and Wenbo Ma and Jun Xiao and Hanwang
                     Zhang and Jian Shao and Yueting Zhuang and Long Chen},
  title           = {{VL-NMS}: Breaking Proposal Bottlenecks in Two-stage
                     Visual-language Matching},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {166:1--166:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3579095},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3579095},
  abstract        = {The prevailing framework for matching multimodal
                     inputs is based on a two-stage process: (1) detecting
                     proposals with an object detector and (2) matching
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {166},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Mackowski:2023:MPI,
  author          = {Micha{\l} Ma{\'c}kowski and Piotr Brzoza and Mateusz
                     Kawulok and Rafa{\l} Meisel and Dominik Spinczyk},
  title           = {Multimodal Presentation of Interactive Audio-Tactile
                     Graphics Supporting the Perception of Visual
                     Information by Blind People},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {167:1--167:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3586076},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3586076},
  abstract        = {Due to the limitations in the perception of graphical
                     information by blind people and the need to substitute
                     the sense of sight with other senses, the correct use
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {167},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Man:2023:TTE,
  author          = {Xin Man and Jie Shao and Feiyu Chen and Mingxing Zhang
                     and Heng Tao Shen},
  title           = {{TEVL}: Trilinear Encoder for Video-language
                     Representation Learning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {168:1--168:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3585388},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3585388},
  abstract        = {Pre-training model on large-scale unlabeled web videos
                     followed by task-specific fine-tuning is a canonical
                     approach to learning video and language \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {168},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Ricci:2023:MLA,
  author          = {Simone Ricci and Tiberio Uricchio and Alberto {Del
                     Bimbo}},
  title           = {Meta-learning Advisor Networks for Long-tail and Noisy
                     Labels in Social Image Classification},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {169:1--169:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3584360},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3584360},
  abstract        = {Deep neural networks (DNNs) for social image
                     classification are prone to performance reduction and
                     overfitting when trained on datasets plagued by
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {169},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Li:2023:LBR,
  author          = {Chen Li and Li Song and Rong Xie and Wenjun Zhang},
  title           = {Local Bidirection Recurrent Network for Efficient
                     Video Deblurring with the Fused Temporal Merge Module},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {170:1--170:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587468},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587468},
  abstract        = {Video deblurring methods exploit the correlation
                     between consecutive blurry inputs to generate sharp
                     frames. However, designing an effective and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {170},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Niu:2023:VCL,
  author          = {Tian-Zi Niu and Zhen-Duo Chen and Xin Luo and Peng-Fei
                     Zhang and Zi Huang and Xin-Shun Xu},
  title           = {Video Captioning by Learning from Global Sentence and
                     Looking Ahead},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {171:1--171:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587252},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587252},
  abstract        = {Video captioning aims to automatically generate
                     natural language sentences describing the content of a
                     video. Although encoder-decoder-based models \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {171},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wang:2023:GAE,
  author          = {Yang Wang and Bo Dong and Ke Xu and Haiyin Piao and
                     Yufei Ding and Baocai Yin and Xin Yang},
  title           = {A Geometrical Approach to Evaluate the Adversarial
                     Robustness of Deep Neural Networks},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {172:1--172:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587936},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587936},
  abstract        = {Deep neural networks (DNNs) are widely used for
                     computer vision tasks. However, it has been shown that
                     deep models are vulnerable to adversarial \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {172},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@Article{Xiang:2023:LML,
  author =       "Suncheng Xiang and Dahong Qian and Mengyuan Guan and
                 Binjie Yan and Ting Liu and Yuzhuo Fu and Guanjie You",
  title =        "Less Is More: Learning from Synthetic Data with
                 Fine-Grained Attributes for Person Re-Identification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "5s",
  pages =        "173:1--173:??",
  month =        oct,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3588441",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Mon Jul 3 08:37:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3588441",
  abstract =     "Person re-identification (ReID) plays an important
                 role in applications such as public security and video
                 surveillance. Recently, learning from synthetic data,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "173",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Siekkinen:2023:NNA,
  author          = {Matti Siekkinen and Teemu K{\"a}m{\"a}r{\"a}inen},
  title           = {Neural Network Assisted Depth Map Packing for
                     Compression Using Standard Hardware Video Codecs},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {174:1--174:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3588440},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3588440},
  abstract        = {Depth maps are needed by various graphics rendering
                     and processing operations. Depth map streaming is often
                     necessary when such operations are performed in a
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {174},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{vanRensburg:2023:OWD,
  author          = {Bianca Jansen van Rensburg and Pauline Puteaux and
                     William Puech and Jean-Pierre Pedeboy},
  title           = {{$3$D} Object Watermarking from Data Hiding in the
                     Homomorphic Encrypted Domain},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {175:1--175:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3588573},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3588573},
  abstract        = {For over a decade, 3D objects are an increasingly
                     popular form of media. It has become necessary and
                     urgent to secure them during their transmission or
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {175},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Liu:2023:CSR,
  author          = {Hao Liu and Xiaoshan Yang and Changsheng Xu},
  title           = {Counterfactual Scenario-relevant Knowledge-enriched
                     Multi-modal Emotion Reasoning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {176:1--176:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3583690},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3583690},
  abstract        = {Multi-modal video emotion reasoning (MERV) has
                     recently attracted increasing attention due to its
                     potential application in human-computer interaction.
                     This \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {176},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Ayoughi:2023:SCE,
  author          = {Melika Ayoughi and Pascal Mettes and Paul Groth},
  title           = {Self-contained Entity Discovery from Captioned
                     Videos},
  journal         = j-TOMM,
  volume          = {19},
  number          = {5s},
  pages           = {177:1--177:??},
  month           = oct,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3583138},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Mon Jul 3 08:37:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3583138},
  abstract        = {This article introduces the task of visual named
                     entity discovery in videos without the need for
                     task-specific supervision or task-specific external
                     knowledge \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {177},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Xie:2023:CFP,
  author          = {Jin Xie and Yanwei Pang and Jing Pan and Jing Nie and
                     Jiale Cao and Jungong Han},
  title           = {Complementary Feature Pyramid Network for Object
                     Detection},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {178:1--178:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3584362},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3584362},
  abstract        = {The way of constructing a robust feature pyramid is
                     crucial for object detection. However, existing feature
                     pyramid methods, which aggregate multi-level features
                     by using element-wise sum or concatenation, are
                     inefficient to construct a robust feature \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {178},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Wang:2023:DCP,
  author          = {Tianyi Wang and Harry Cheng and Kam Pui Chow and
                     Liqiang Nie},
  title           = {Deep Convolutional Pooling Transformer for Deepfake
                     Detection},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {179:1--179:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3588574},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3588574},
  abstract        = {Recently, Deepfake has drawn considerable public
                     attention due to security and privacy concerns in
                     social media digital forensics. As the wildly spreading
                     Deepfake videos on the Internet become more realistic,
                     traditional detection techniques have failed \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {179},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Chan:2023:LDF,
  author          = {Patrick P. K. Chan and Xiaoman Hu and Haorui Song and
                     Peng Peng and Keke Chen},
  title           = {Learning Disentangled Features for Person
                     Re-identification under Clothes Changing},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {180:1--180:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3584359},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3584359},
  abstract        = {Clothes changing is one of the challenges in person
                     re-identification (ReID), since clothes provide
                     remarkable and reliable information for decision,
                     especially when the resolution of an image is low.
                     Variation of clothes significantly downgrades standard
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {180},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zeng:2023:CFG,
  author          = {Rongfei Zeng and Mai Su and Ruiyun Yu and Xingwei
                     Wang},
  title           = {{CD$^2$}: Fine-grained {$3$D} Mesh Reconstruction with
                     Twice Chamfer Distance},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {181:1--181:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3582694},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3582694},
  abstract        = {Monocular 3D reconstruction is to reconstruct the
                     shape of object and its other information from a single
                     RGB image. In 3D reconstruction, polygon mesh, with
                     detailed surface information and low computational
                     cost, is the most prevalent expression form \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {181},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Niu:2023:SEV,
  author          = {Tian-Zi Niu and Shan-Shan Dong and Zhen-Duo Chen and
                     Xin Luo and Shanqing Guo and Zi Huang and Xin-Shun Xu},
  title           = {Semantic Enhanced Video Captioning with Multi-feature
                     Fusion},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {182:1--182:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3588572},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3588572},
  abstract        = {Video captioning aims to automatically describe a
                     video clip with informative sentences. At present, deep
                     learning-based models have become the mainstream for
                     this task and achieved competitive results on public
                     datasets. Usually, these methods leverage \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {182},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Li:2023:TBV,
  author          = {Kun Li and Jiaxiu Li and Dan Guo and Xun Yang and Meng
                     Wang},
  title           = {Transformer-Based Visual Grounding with Cross-Modality
                     Interaction},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {183:1--183:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587251},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587251},
  abstract        = {This article tackles the challenging yet important
                     task of Visual Grounding (VG), which aims to localize a
                     visual region in the given image referred by a natural
                     language query. Existing efforts on the VG task are
                     twofold: (1) two-stage methods first \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {183},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Xie:2023:VPG,
  author          = {Jiayuan Xie and Jiali Chen and Yi Cai and Qingbao
                     Huang and Qing Li},
  title           = {Visual Paraphrase Generation with Key Information
                     Retained},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {184:1--184:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3585010},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3585010},
  abstract        = {Visual paraphrase generation task aims to rewrite a
                     given image-related original sentence into a new
                     paraphrase, where the paraphrase needs to have the same
                     expressed meaning as the original sentence but have a
                     difference in expression form. Existing \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {184},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Liu:2023:NVS,
  author          = {Bingzheng Liu and Jianjun Lei and Bo Peng and Chuanbo
                     Yu and Wanqing Li and Nam Ling},
  title           = {Novel View Synthesis from a Single Unposed Image via
                     Unsupervised Learning},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {186:1--186:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3587467},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3587467},
  abstract        = {Novel view synthesis aims to generate novel views from
                     one or more given source views. Although existing
                     methods have achieved promising performance, they
                     usually require paired views with different poses to
                     learn a pixel transformation. This article \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {186},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhou:2023:LLI,
  author          = {Mingliang Zhou and Hongyue Leng and Bin Fang and Tao
                     Xiang and Xuekai Wei and Weijia Jia},
  title           = {Low-light Image Enhancement via a Frequency-based
                     Model with Structure and Texture Decomposition},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {187:1--187:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3590965},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3590965},
  abstract        = {This article proposes a frequency-based structure and
                     texture decomposition model in a Retinex-based
                     framework for low-light image enhancement and noise
                     suppression. First, we utilize the total
                     variation-based noise estimation to decompose the
                     observed \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {187},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@article{Zhu:2023:AAM,
  author          = {Hongguang Zhu and Yunchao Wei and Yao Zhao and Chunjie
                     Zhang and Shujuan Huang},
  title           = {{AMC}: Adaptive Multi-expert Collaborative Network for
                     Text-guided Image Retrieval},
  journal         = j-TOMM,
  volume          = {19},
  number          = {6},
  pages           = {188:1--188:??},
  month           = nov,
  year            = {2023},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3584703},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  bibdate         = {Fri Sep 29 07:50:46 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url             = {https://dl.acm.org/doi/10.1145/3584703},
  abstract        = {Text-guided image retrieval integrates reference image
                     and text feedback as a multimodal query to search the
                     image corresponding to user intention. Recent
                     approaches employ multi-level matching, multiple
                     accesses, or multiple subnetworks for better \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {188},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
}

@Article{Fontanini:2023:UDM,
  author =       "Tomaso Fontanini and Luca Donati and Massimo Bertozzi
                 and Andrea Prati",
  title =        "Unsupervised Discovery and Manipulation of Continuous
                 Disentangled Factors of Variation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "189:1--189:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3591358",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3591358",
  abstract =     "Learning a disentangled representation of a
                 distribution in a completely unsupervised way is a
                 challenging task that has drawn attention recently. In
                 particular, much focus has been put in separating
                 factors of variation (i.e., attributes) within the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "189",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kumar:2023:AFS,
  author =       "Puneet Kumar and Gaurav Bhatt and Omkar Ingle and
                 Daksh Goyal and Balasubramanian Raman",
  title =        "Affective Feedback Synthesis Towards Multimodal Text
                 and Image Data",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "190:1--190:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3589186",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3589186",
  abstract =     "In this article, we have defined a novel task of
                 affective feedback synthesis that generates feedback
                 for input text and corresponding images in a way
                 similar to humans responding to multimodal data. A
                 feedback synthesis system has been proposed and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "190",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2023:AAS,
  author =       "Yikun Xu and Xingxing Wei and Pengwen Dai and Xiaochun
                 Cao",
  title =        "{A$^2$SC}: Adversarial Attacks on Subspace
                 Clustering",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "191:1--191:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3587097",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3587097",
  abstract =     "Many studies demonstrate that supervised learning
                 techniques are vulnerable to adversarial examples.
                 However, adversarial threats in unsupervised learning
                 have not drawn sufficient scholarly attention. In this
                 article, we formally address the unexplored \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "191",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zeng:2023:DTV,
  author =       "Xianhua Zeng and Saiyuan Chen and Yicai Xie and
                 Tianxing Liao",
  title =        "{3V3D}: Three-View Contextual Cross-slice Difference
                 Three-dimensional Medical Image Segmentation
                 Adversarial Network",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "192:1--192:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3592614",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3592614",
  abstract =     "In three-dimensional (3D) medical image segmentation,
                 it is still a great challenge to obtain the
                 multidimensional feature information contained in voxel
                 images using a single view for smaller segmentation
                 targets, and the robustness of models obtained by
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "192",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Becattini:2023:VLS,
  author =       "Federico Becattini and Pietro Bongini and Luana Bulla
                 and Alberto {Del Bimbo} and Ludovica Marinucci and
                 Misael Mongiov{\`\i} and Valentina Presutti",
  title =        "{VISCOUNTH}: a Large-scale Multilingual Visual
                 Question Answering Dataset for Cultural Heritage",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "193:1--193:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3590773",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3590773",
  abstract =     "Visual question answering has recently been settled as
                 a fundamental multi-modal reasoning task of artificial
                 intelligence that allows users to get information about
                 visual content by asking questions in natural language.
                 In the cultural heritage domain, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "193",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hsu:2023:RMS,
  author =       "Wei-Yen Hsu and Pei-Wen Jian",
  title =        "Recurrent Multi-scale Approximation-Guided Network for
                 Single Image Super-Resolution",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "194:1--194:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3592613",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3592613",
  abstract =     "Single-image super-resolution (SISR) is an essential
                 topic in computer vision applications. However, most
                 CNN-based SISR approaches directly learn the
                 relationship between low- and high-resolution images
                 while ignoring the contextual texture and detail
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "194",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:HAW,
  author =       "Bo Li and Yong Zhang and Chengyang Zhang and Xinglin
                 Piao and Baocai Yin",
  title =        "Hypergraph Association Weakly Supervised Crowd
                 Counting",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "195:1--195:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3594670",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3594670",
  abstract =     "Weakly supervised crowd counting involves the
                 regression of the number of individuals present in an
                 image, using only the total number as the label.
                 However, this task is plagued by two primary
                 challenges: the large variation of head size and uneven
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "195",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tai:2023:MAS,
  author =       "Yichun Tai and Hailin Shi and Dan Zeng and Hang Du and
                 Yibo Hu and Zicheng Zhang and Zhijiang Zhang and Tao
                 Mei",
  title =        "Multi-Agent Semi-{Siamese} Training for Long-Tail and
                 Shallow Face Learning",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "196:1--196:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3594669",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3594669",
  abstract =     "With the recent development of deep convolutional
                 neural networks and large-scale datasets, deep face
                 recognition has made remarkable progress and been
                 widely used in various applications. However, unlike
                 the existing public face datasets, in many real-world
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "196",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2023:PEE,
  author =       "Rui Li and Baopeng Zhang and Wei Liu and Zhu Teng and
                 Jianping Fan",
  title =        "{PANet}: an End-to-end Network Based on Relative
                 Motion for Online Multi-object Tracking",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "197:1--197:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3595379",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3595379",
  abstract =     "The popular tracking-by-detection paradigm of
                 multi-object tracking (MOT) takes detections of each
                 frame as the input and associates detections from one
                 frame to another. Existing association methods based on
                 the relative motion have attracted attention,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "197",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yuan:2023:SBD,
  author =       "Ye Yuan and Jiawan Zhang",
  title =        "Shot Boundary Detection Using Color Clustering and
                 Attention Mechanism",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "198:1--198:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3595923",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3595923",
  abstract =     "Shot boundary detection (SBD) is widely used in scene
                 segmentation, semantic analysis, and video retrieval.
                 However, existing SBD algorithms have certain
                 applications in video processing, but they have the
                 following three problems. First, these algorithms
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "198",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Huang:2023:TIS,
  author =       "Cong Huang and Xiulian Peng and Dong Liu and Yan Lu",
  title =        "Text Image Super-Resolution Guided by Text Structure
                 and Embedding Priors",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "199:1--199:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3595924",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3595924",
  abstract =     "We aim to super-resolve text images from
                 unrecognizable low-resolution inputs. Existing
                 super-resolution methods mainly learn a direct mapping
                 from low-resolution to high-resolution images by
                 exploring low-level features, which usually generate
                 blurry \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "199",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhu:2023:MLR,
  author =       "Jie Zhu and Bo Peng and Wanqing Li and Haifeng Shen
                 and Qingming Huang and Jianjun Lei",
  title =        "Modeling Long-range Dependencies and Epipolar Geometry
                 for Multi-view Stereo",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "200:1--200:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3596445",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3596445",
  abstract =     "This article proposes a network, referred to as
                 Multi-View Stereo TRansformer (MVSTR) for depth
                 estimation from multi-view images. By modeling
                 long-range dependencies and epipolar geometry, the
                 proposed MVSTR is capable of extracting dense features
                 with \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "200",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2023:IFD,
  author =       "Xiumei Chen and Xiangtao Zheng and Xiaoqiang Lu",
  title =        "Identity Feature Disentanglement for Visible-Infrared
                 Person Re-Identification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "201:1--201:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3595183",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3595183",
  abstract =     "Visible-infrared person re-identification (VI-ReID)
                 task aims to retrieve persons from different spectrum
                 cameras (i.e., visible and infrared images). The
                 biggest challenge of VI-ReID is the huge cross-modal
                 discrepancy caused by different imaging \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "201",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shu:2023:CAP,
  author =       "Zhenyu Shu and Ling Gao and Shun Yi and Fangyu Wu and
                 Xin Ding and Ting Wan and Shiqing Xin",
  title =        "Context-Aware {$3$D} Points of Interest Detection via
                 Spatial Attention Mechanism",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "202:1--202:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597026",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597026",
  abstract =     "Detecting points of interest is a fundamental problem
                 in 3D shape analysis and can be beneficial to various
                 tasks in multimedia processing. Traditional
                 learning-based detection methods usually rely on each
                 vertex's geometric features to discriminate \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "202",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2023:CCF,
  author =       "Zhen Chen and Ming Yang and Shiliang Zhang",
  title =        "Complementary Coarse-to-Fine Matching for Video Object
                 Segmentation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "203:1--203:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3596496",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3596496",
  abstract =     "Semi-supervised Video Object Segmentation (VOS) needs
                 to establish pixel-level correspondences between a
                 video frame and preceding segmented frames to leverage
                 their segmentation clues. Most works rely on features
                 at a single scale to establish those \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "203",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Srinivas:2023:CBN,
  author =       "Kankanala Srinivas and Ashish Kumar Bhandari",
  title =        "Context-Based Novel Histogram Bin Stretching Algorithm
                 for Automatic Contrast Enhancement",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "204:1--204:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597303",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597303",
  abstract =     "This article presents CHBS, a novel context-based
                 histogram bin stretching method that enhances the
                 contrast by increasing the range of gray levels and
                 randomness among the gray levels. It comprises image
                 spatial contextual information and discrete cosine
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "204",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tang:2023:UDA,
  author =       "Zhenjun Tang and Zhiyuan Chen and Zhixin Li and Bineng
                 Zhong and Xianquan Zhang and Xinpeng Zhang",
  title =        "Unifying Dual-Attention and {Siamese} Transformer
                 Network for Full-Reference Image Quality Assessment",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "205:1--205:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597434",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597434",
  abstract =     "Image Quality Assessment (IQA) is a critical task of
                 computer vision. Most Full-Reference (FR) IQA methods
                 have limitation in the accurate prediction of
                 perceptual qualities of the traditional distorted
                 images and the Generative Adversarial Networks (GANs).
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "205",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tang:2023:LSR,
  author =       "Geyu Tang and Xingyu Gao and Zhenyu Chen",
  title =        "Learning Semantic Representation on Visual Attribute
                 Graph for Person Re-identification and Beyond",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "206:1--206:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3487044",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3487044",
  abstract =     "Person re-identification (re-ID) aims to match
                 pedestrian pairs captured from different cameras.
                 Recently, various attribute-based models have been
                 proposed to combine the pedestrian attribute as an
                 auxiliary semantic information to learn a more
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "206",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Deng:2023:LGL,
  author =       "Zijun Deng and Xiangteng He and Yuxin Peng",
  title =        "{LFR-GAN}: Local Feature Refinement based Generative
                 Adversarial Network for Text-to-Image Generation",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "207:1--207:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3589002",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3589002",
  abstract =     "Text-to-image generation aims to generate images from
                 text descriptions. Its main challenge lies in two
                 aspects: (1) Semantic consistency, i.e., the generated
                 images should be semantically consistent with the input
                 text; and (2) Visual reality, i.e., the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "207",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Du:2023:WSH,
  author =       "Yongchao Du and Min Wang and Zhenbo Lu and Wengang
                 Zhou and Houqiang Li",
  title =        "Weakly Supervised Hashing with Reconstructive
                 Cross-modal Attention",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "208:1--208:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3589185",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3589185",
  abstract =     "On many popular social websites, images are usually
                 associated with some meta-data such as textual tags,
                 which involve semantic information relevant to the
                 image and can be used to supervise the representation
                 learning for image retrieval. However, these \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "208",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2023:CSC,
  author =       "Meng Wang and Jizheng Xu and Li Zhang and Junru Li and
                 Kai Zhang and Shiqi Wang and Siwei Ma",
  title =        "Compressed Screen Content Image Super Resolution",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "209:1--209:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3589963",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3589963",
  abstract =     "Screen content has become one of the prominent mediums
                 in the increasingly connected world. With the
                 prevalence of remote collaboration and communication
                 such as virtual conferences and online education,
                 recent years have witnessed a dramatic increase in
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "209",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2023:CUH,
  author =       "Boqiang Xu and Jian Liang and Lingxiao He and Jinlin
                 Wu and Chao Fan and Zhenan Sun",
  title =        "Color-Unrelated Head-Shoulder Networks for
                 Fine-Grained Person Re-identification",
  journal =      j-TOMM,
  volume =       "19",
  number =       "6",
  pages =        "210:1--210:??",
  month =        nov,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3599730",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:46 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3599730",
  abstract =     "Person re-identification (re-id) attempts to match
                 pedestrian images with the same identity across
                 non-overlapping cameras. Existing methods usually study
                 person re-id by learning discriminative features based
                 on the clothing attributes (e.g., color, \ldots{})",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "210",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2024:IBC,
  author =       "Zhenbo Xu and Hai-Miao Hu and Liu Liu and Dongping
                 Zhang and Shifeng Zhang and Wenming Tan",
  title =        "Instance-Based Continual Learning: a Real-World
                 Dataset and Baseline for Fresh Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3591209",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3591209",
  abstract =     "Real-time learning on real-world data streams with
                 temporal relations is essential for intelligent agents.
                 However, current online Continual Learning (CL)
                 benchmarks adopt the mini-batch setting and are
                 composed of temporally unrelated and disjoint tasks
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liang:2024:RHG,
  author =       "Xiaoping Liang and Zhenjun Tang and Zhixin Li and
                 Mengzhu Yu and Hanyun Zhang and Xianquan Zhang",
  title =        "Robust Hashing via Global and Local Invariant Features
                 for Image Copy Detection",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3600234",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3600234",
  abstract =     "Robust hashing is a powerful technique for processing
                 large-scale images. Currently, many reported image
                 hashing schemes do not perform well in balancing the
                 performances of discrimination and robustness, and thus
                 they cannot efficiently detect image \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Sarma:2024:DID,
  author =       "Sandipan Sarma and Arijit Sur",
  title =        "{DiRaC-I}: Identifying Diverse and Rare Training
                 Classes for Zero-Shot Learning",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603147",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603147",
  abstract =     "Zero-Shot Learning (ZSL) is an extreme form of
                 transfer learning that aims at learning from a few
                 ``seen classes'' to have an understanding about the
                 ``unseen classes'' in the wild. Given a dataset in ZSL
                 research, most existing works use a predetermined,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zheng:2024:SSJ,
  author =       "Chengyu Zheng and Ning Song and Ruoyu Zhang and Lei
                 Huang and Zhiqiang Wei and Jie Nie",
  title =        "Scale-Semantic Joint Decoupling Network for Image-Text
                 Retrieval in Remote Sensing",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603628",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603628",
  abstract =     "Image-text retrieval in remote sensing aims to provide
                 flexible information for data analysis and application.
                 In recent years, state-of-the-art methods are dedicated
                 to ``scale decoupling'' and ``semantic decoupling''
                 strategies to further enhance the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2024:ZSS,
  author =       "Jiankai Li and Yunhong Wang and Weixin Li",
  title =        "Zero-shot Scene Graph Generation via Triplet
                 Calibration and Reduction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3604284",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3604284",
  abstract =     "Scene Graph Generation (SGG) plays a pivotal role in
                 downstream vision-language tasks. Existing SGG methods
                 typically suffer from poor compositional
                 generalizations on unseen triplets. They are generally
                 trained on incompletely annotated scene graphs that
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yaqoob:2024:APT,
  author =       "Abid Yaqoob and Gabriel-Miro Muntean",
  title =        "Advanced Predictive Tile Selection Using Dynamic
                 Tiling for Prioritized 360${}^\circ $ Video {VR}
                 Streaming",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603146",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603146",
  abstract =     "The widespread availability of smart computing and
                 display devices such as mobile phones, gaming consoles,
                 laptops, and tethered/untethered head-mounted displays
                 has fueled an increase in demand for omnidirectional
                 (360${}^\circ $) videos. 360${}^\circ $ video
                 applications \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2024:LGR,
  author =       "Jia Wang and Hong-Han Shuai and Yung-Hui Li and
                 Wen-Huang Cheng",
  title =        "Language-guided Residual Graph Attention Network and
                 Data Augmentation for Visual Grounding",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3604557",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3604557",
  abstract =     "Visual grounding is an essential task in understanding
                 the semantic relationship between the given text
                 description and the target object in an image. Due to
                 the innate complexity of language and the rich semantic
                 context of the image, it is still a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2024:ACN,
  author =       "Haoran Wang and Yajie Wang and Baosheng Yu and Yibing
                 Zhan and Chunfeng Yuan and Wankou Yang",
  title =        "Attentional Composition Networks for Long-Tailed Human
                 Action Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603253",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603253",
  abstract =     "The problem of long-tailed visual recognition has been
                 receiving increasing research attention. However, the
                 long-tailed distribution problem remains underexplored
                 for video-based visual recognition. To address this
                 issue, in this article we propose a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:SSC,
  author =       "Zi-Chao Zhang and Zhen-Duo Chen and Zhen-Yu Xie and
                 Xin Luo and Xin-Shun Xu",
  title =        "{S3Mix}: Same Category Same Semantics Mixing for
                 Augmenting Fine-grained Images",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605892",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605892",
  abstract =     "Data augmentation is a common technique to improve the
                 generalization performance of models for image
                 classification. Although methods such as Mixup and
                 CutMix that mix images randomly are indeed instrumental
                 in general image classification, randomly \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tan:2024:TBR,
  author =       "Mingkui Tan and Zhiquan Wen and Leyuan Fang and Qi
                 Wu",
  title =        "Transformer-Based Relational Inference Network for
                 Complex Visual Relational Reasoning",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605781",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605781",
  abstract =     "Visual Relational Reasoning is the basis of many
                 vision-and-language based tasks (e.g., visual question
                 answering and referring expression comprehension). In
                 this article, we regard the complex referring
                 expression comprehension (c-REF) task as the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2024:SSL,
  author =       "Yiming Yang and Weipeng Hu and Haifeng Hu",
  title =        "Syncretic Space Learning Network for {NIR-VIS} Face
                 Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3607143",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3607143",
  abstract =     "To overcome the technical bottleneck of face
                 recognition in low-light scenarios, Near-InfraRed and
                 VISible (NIR-VIS) heterogeneous face recognition is
                 proposed for matching well-lit VIS faces with poorly
                 lit NIR faces. Current cross-modal synthesis \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2024:DWG,
  author =       "Chenghua Li and Zongze Li and Jing Sun and Yun Zhang
                 and Xiaoping Jiang and Fan Zhang",
  title =        "Dynamic Weighted Gradient Reversal Network for
                 Visible-infrared Person Re-identification",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3607535",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3607535",
  abstract =     "Due to intra-modality variations and cross-modality
                 discrepancy, visible-infrared person re-identification
                 (VI Re-ID) is an important and challenging task in
                 intelligent video surveillance. The cross-modality
                 discrepancy is mainly caused by the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Song:2024:TFI,
  author =       "Jiajun Song and Zhuo Li and Weiqing Min and Shuqiang
                 Jiang",
  title =        "Towards Food Image Retrieval via
                 Generalization-Oriented Sampling and Loss Function
                 Design",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3600095",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3600095",
  abstract =     "Food computing has increasingly received widespread
                 attention in the multimedia field. As a basic task of
                 food computing, food image retrieval has wide
                 applications, that is, food image retrieval can help
                 users to find the desired food from a large number
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Jin:2024:CBN,
  author =       "Yiting Jin and Jie Wu and Wanliang Wang and Yidong Yan
                 and Jiawei Jiang and Jianwei Zheng",
  title =        "Cascading Blend Network for Image Inpainting",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "14:1--14:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3608952",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3608952",
  abstract =     "Image inpainting refers to filling in unknown regions
                 with known knowledge, which is in full flourish
                 accompanied by the popularity and prosperity of deep
                 convolutional networks. Current inpainting methods have
                 excelled in completing small-sized \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Guo:2024:DLS,
  author =       "Kehua Guo and Liang Chen and Xiangyuan Zhu and Xiaoyan
                 Kui and Jian Zhang and Heyuan Shi",
  title =        "Double-Layer Search and Adaptive Pooling Fusion for
                 Reference-Based Image Super-Resolution",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3604937",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3604937",
  abstract =     "Reference-based image super-resolution (RefSR) aims to
                 reconstruct high-resolution (HR) images from
                 low-resolution (LR) images by introducing HR reference
                 images. The key step of RefSR is to transfer reference
                 features to LR features. However, existing \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2024:UOF,
  author =       "Jing Zhao and Bin Li and Jiahao Li and Ruiqin Xiong
                 and Yan Lu",
  title =        "A Universal Optimization Framework for Learning-based
                 Image Codec",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580499",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580499",
  abstract =     "Recently, machine learning-based image compression has
                 attracted increasing interests and is approaching the
                 state-of-the-art compression ratio. But unlike
                 traditional codec, it lacks a universal optimization
                 method to seek efficient representation for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:ICS,
  author =       "Liping Zhang and Shukai Chen and Fei Lin and Wei Ren
                 and Kim-Kwang Raymond Choo and Geyong Min",
  title =        "{$1$DIEN}: Cross-session Electrocardiogram
                 Authentication Using {$1$D} Integrated {EfficientNet}",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "17:1--17:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3609800",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3609800",
  abstract =     "The potential of using electrocardiogram (ECG), an
                 important physiological signal for humans, as a new
                 biometric trait has been demonstrated, and ongoing
                 efforts have focused on utilizing deep learning (e.g.,
                 2D neural networks) to improve authentication
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:DMP,
  author =       "Baian Chen and Zhilei Chen and Xiaowei Hu and Jun Xu
                 and Haoran Xie and Jing Qin and Mingqiang Wei",
  title =        "Dynamic Message Propagation Network for {RGB-D} and
                 Video Salient Object Detection",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "18:1--18:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597612",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597612",
  abstract =     "Exploiting long-range semantic contexts and geometric
                 information is crucial to infer salient objects from
                 RGB and depth features. However, existing methods
                 mainly focus on excavating local features within fixed
                 regions by continuously feeding forward \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gao:2024:SSM,
  author =       "Xiang Gao and Wei Hu and Guo-Jun Qi",
  title =        "Self-supervised Multi-view Learning via Auto-encoding
                 {$3$D} Transformations",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "19:1--19:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3597613",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3597613",
  abstract =     "3D object representation learning is a fundamental
                 challenge in computer vision to infer about the 3D
                 world. Recent advances in deep learning have shown
                 their efficiency in 3D object recognition, among which
                 view-based methods have performed best so far.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2024:EAE,
  author =       "Dewang Wang and Gaobo Yang and Zhiqing Guo and Jiyou
                 Chen",
  title =        "Enhancing Adversarial Embedding based Image
                 Steganography via Clustering Modification Directions",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "20:1--20:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3603377",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3603377",
  abstract =     "Image steganography is a technique used to conceal
                 secret information within cover images without being
                 detected. However, the advent of convolutional neural
                 networks (CNNs) has threatened the security of image
                 steganography. Due to the inherent \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2024:DHO,
  author =       "Xiaojia Zhao and Tingting Xu and Qiangqiang Shen and
                 Youfa Liu and Yongyong Chen and Jingyong Su",
  title =        "Double High-Order Correlation Preserved Robust
                 Multi-View Ensemble Clustering",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "21:1--21:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3612923",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3612923",
  abstract =     "Ensemble clustering (EC), utilizing multiple basic
                 partitions (BPs) to yield a robust consensus
                 clustering, has shown promising clustering performance.
                 Nevertheless, most current algorithms suffer from two
                 challenging hurdles: (1) a surge of EC-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Tasaka:2024:UQM,
  author =       "Shuji Tasaka",
  title =        "Usefulness of {QoS} in Multidimensional {QoE}
                 Prediction for Haptic-Audiovisual Communications",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "22:1--22:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3613246",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3613246",
  abstract =     "This article investigates prediction of Quality of
                 Experience (QoE) by comparing borrowing-from-neighbor
                 situations and isolated ones. We demonstrate that joint
                 utilization of multiple QoE measures enhances the
                 accuracy of QoE prediction compared to that \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Yang:2024:EIC,
  author =       "Ching-Nung Yang and Xiaotian Wu and Min-Jung Chung",
  title =        "Enhancement of Information Carrying and Decoding for
                 Visual Cryptography with Error Correction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "23:1--23:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3612927",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3612927",
  abstract =     "Recently, three visual cryptography schemes with
                 $t$-error-correcting capability (VCSs-$t$EC) were
                 introduced for preventing the shadows carrying
                 additional information from being corrupted by noise
                 interference. However, the concerns on VCS-$t$EC, such
                 as the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:SSV,
  author =       "Yuqing Zhang and Yong Zhang and Shaofan Wang and Yun
                 Liang and Baocai Yin",
  title =        "Semi-supervised Video Object Segmentation Via an Edge
                 Attention Gated Graph Convolutional Network",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3611389",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3611389",
  abstract =     "Video object segmentation (VOS) exhibits heavy
                 occlusions, large deformation, and severe motion blur.
                 While many remarkable convolutional neural networks are
                 devoted to the VOS task, they often mis-identify
                 background noise as the target or output coarse
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wen:2024:VSI,
  author =       "Wenying Wen and Minghui Huang and Yushu Zhang and
                 Yuming Fang and Yifan Zuo",
  title =        "Visual Security Index Combining {CNN} and Filter for
                 Perceptually Encrypted Light Field Images",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "25:1--25:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3612924",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3612924",
  abstract =     "Visual security index (VSI) represents a quantitative
                 index for the visual security evaluation of
                 perceptually encrypted images. Recently, the research
                 on visual security of encrypted light field (LF) images
                 faces two challenges. One is that the existing
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Liu:2024:CCS,
  author =       "Linlin Liu and Haijun Zhang and Qun Li and Jianghong
                 Ma and Zhao Zhang",
  title =        "Collocated Clothing Synthesis with {GANs} Aided by
                 Textual Information: a Multi-Modal Framework",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "26:1--26:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3614097",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3614097",
  abstract =     "Synthesizing realistic images of fashion items which
                 are compatible with given clothing images, as well as
                 conditioning on multiple modalities, brings novel and
                 exciting applications together with enormous economic
                 potential. In this work, we propose a \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lou:2024:SSC,
  author =       "Xulei Lou and Tinghui Wu and Haifeng Hu and Dihu
                 Chen",
  title =        "Self-Supervised Consistency Based on Joint Learning
                 for Unsupervised Person Re-identification",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "27:1--27:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3612926",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3612926",
  abstract =     "Recently, unsupervised domain adaptive person
                 re-identification (Re-ID) methods have been extensively
                 studied thanks to not requiring annotations, and they
                 have achieved excellent performance. Most of the
                 existing methods aim to train the Re-ID model for
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:CAP,
  author =       "Yichi Zhang and Gongchun Ding and Dandan Ding and Zhan
                 Ma and Zhu Li",
  title =        "On Content-Aware Post-Processing: Adapting
                 Statistically Learned Models to Dynamic Content",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "28:1--28:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3612925",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3612925",
  abstract =     "Learning-based post-processing methods generally
                 produce neural models that are statistically optimal on
                 their training datasets. These models, however, neglect
                 intrinsic variations of local video content and may
                 fail to process unseen content. To address \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xu:2024:DIC,
  author =       "Jing Xu and Bing Liu and Yong Zhou and Mingming Liu
                 and Rui Yao and Zhiwen Shao",
  title =        "Diverse Image Captioning via Conditional Variational
                 Autoencoder and Dual Contrastive Learning",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "29:1--29:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3614435",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3614435",
  abstract =     "Diverse image captioning has achieved substantial
                 progress in recent years. However, the discriminability
                 of generative models and the limitation of cross
                 entropy loss are generally overlooked in the
                 traditional diverse image captioning models, which
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zou:2024:CLN,
  author =       "Cong Zou and Rui Wang and Cheng Jin and Sanyi Zhang
                 and Xin Wang",
  title =        "{S$^2$CL-LeafNet}: Recognizing Leaf Images
                 Like Human Botanists",
  journal =      j-TOMM,
  volume =       "20",
  number =       "1",
  pages =        "30:1--30:??",
  month =        jan,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3615659",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Sep 29 07:50:48 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3615659",
  abstract =     "Automatically classifying plant leaves is a
                 challenging fine-grained classification task because of
                 the diversity in leaf morphology, including size,
                 texture, shape, and venation. Although powerful deep
                 learning-based methods have achieved great \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Namasudra:2024:ISI,
  author =       "Suyel Namasudra and Pascal Lorenz and Seifedine Kadry
                 and Syed Ahmad Chan Bukhari",
  title =        "Introduction to the Special Issue on {DNA}-centric
                 Modeling and Practice for Next-generation Computing and
                 Communication Systems",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "31:1--31:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3578364",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3578364",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wan:2024:ESI,
  author =       "Shaohua Wan and Yi Jin and Guangdong Xu and Michele
                 Nappi",
  title =        "Editorial to Special Issue on Multimedia Cognitive
                 Computing for Intelligent Transportation System",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "32:1--32:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3604938",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3604938",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhao:2024:TEL,
  author =       "Ruonan Zhao and Laurence T. Yang and Debin Liu and
                 Wanli Lu and Chenlu Zhu and Yiheng Ruan",
  title =        "Tensor-Empowered {LSTM} for Communication-Efficient
                 and Privacy-Enhanced Cognitive Federated Learning in
                 Intelligent Transportation Systems",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "33:1--33:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3575661",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3575661",
  abstract =     "Multimedia cognitive computing as a revolutionary
                 emerging concept of artificial intelligence emulating
                 the reasoning process like human brains can facilitate
                 the evolution of intelligent transportation systems
                 (ITS) to be smarter, safer, and more \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shi:2024:RSB,
  author =       "Hongjian Shi and Hao Wang and Ruhui Ma and Yang Hua
                 and Tao Song and Honghao Gao and Haibing Guan",
  title =        "Robust Searching-Based Gradient Collaborative
                 Management in Intelligent Transportation System",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "34:1--34:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3549939",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3549939",
  abstract =     "With the rapid development of big data and the
                 Internet of Things (IoT), traffic data from an
                 Intelligent Transportation System (ITS) is becoming
                 more and more accessible. To understand and simulate
                 the traffic patterns from the traffic data, Multimedia
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Weng:2024:HHC,
  author =       "Zejia Weng and Zuxuan Wu and Hengduo Li and Jingjing
                 Chen and Yu-Gang Jiang",
  title =        "{HCMS}: Hierarchical and Conditional Modality
                 Selection for Efficient Video Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "35:1--35:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3572776",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3572776",
  abstract =     "Videos are multimodal in nature. Conventional video
                 recognition pipelines typically fuse multimodal
                 features for improved performance. However, this is not
                 only computationally expensive but also neglects the
                 fact that different videos rely on different \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:DAS,
  author =       "Shixiong Zhang and Wenmin Wang and Honglei Li and
                 Shenyong Zhang",
  title =        "E-detector: Asynchronous Spatio-temporal for
                 Event-based Object Detection in Intelligent
                 Transportation System",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "36:1--36:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3584361",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3584361",
  abstract =     "In intelligent transportation systems, various
                 sensors, including radar and conventional frame
                 cameras, are used to improve system robustness in
                 various challenging scenarios. An event camera is a
                 novel bio-inspired sensor that has attracted the
                 interest \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Padhy:2024:MVA,
  author =       "Ram Prasad Padhy and Pankaj Kumar Sa and Fabio
                 Narducci and Carmen Bisogni and Sambit Bakshi",
  title =        "Monocular Vision-aided Depth Measurement from {RGB}
                 Images for Autonomous {UAV} Navigation",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "37:1--37:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3550485",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3550485",
  abstract =     "Monocular vision-based 3D scene understanding has been
                 an integral part of many machine vision applications.
                 Always, the objective is to measure the depth using a
                 single RGB camera, which is at par with the depth
                 cameras. In this regard, monocular vision-\ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lv:2024:SID,
  author =       "Zhihan Lv and Fabio Poiesi and Qi Dong and Jaime
                 Lloret and Houbing Song",
  title =        "Special Issue on Deep Learning for Intelligent Human
                 Computer Interaction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "38:1--38:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605151",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605151",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gong:2024:MMM,
  author =       "Wenjuan Gong and Yue Zhang and Wei Wang and Peng Cheng
                 and Jordi Gonz{\`a}lez",
  title =        "{Meta-MMFNet}: Meta-learning-based Multi-model Fusion
                 Network for Micro-expression Recognition",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "39:1--39:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3539576",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3539576",
  abstract =     "Despite its wide applications in criminal
                 investigations and clinical communications with
                 patients suffering from autism, automatic
                 micro-expression recognition remains a challenging
                 problem because of the lack of training data and
                 imbalanced classes \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Djenouri:2024:EAG,
  author =       "Youcef Djenouri and Asma Belhadi and Gautam Srivastava
                 and Jerry Chun-Wei Lin",
  title =        "An Efficient and Accurate {GPU}-based Deep Learning
                 Model for Multimedia Recommendation",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "40:1--40:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3524022",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3524022",
  abstract =     "This article proposes the use of deep learning in
                 human-computer interaction and presents a new
                 explainable hybrid framework for recommending relevant
                 hashtags on a set of orpheline tweets, which are tweets
                 with hashtags. The approach starts by \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Loveleen:2024:EDH,
  author =       "Gaur Loveleen and Bhandari Mohan and Bhadwal Singh
                 Shikhar and Jhanjhi Nz and Mohammad Shorfuzzaman and
                 Mehedi Masud",
  title =        "Explanation-Driven {HCI} Model to Examine the
                 Mini-Mental State for {Alzheimer}'s Disease",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "41:1--41:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3527174",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3527174",
  abstract =     "Directing research on Alzheimer's disease toward only
                 early prediction and accuracy cannot be considered a
                 feasible approach toward tackling a ubiquitous
                 degenerative disease today. Applying deep learning
                 (DL), Explainable artificial intelligence, and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "41",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Li:2024:AAD,
  author =       "Mi Li and Wei Zhang and Bin Hu and Jiaming Kang and
                 Yuqi Wang and Shengfu Lu",
  title =        "Automatic Assessment of Depression and Anxiety through
                 Encoding Pupil-wave from {HCI} in {VR} Scenes",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "42:1--42:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3513263",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3513263",
  abstract =     "At present, there have been many studies on the
                 methods of using the deep learning regression model to
                 assess depression level based on behavioral signals
                 (facial expression, speech, and language); however, the
                 research on the assessment method of anxiety \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "42",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qayyum:2024:SFB,
  author =       "Abdul Qayyum and Imran Razzak and M. Tanveer and Moona
                 Mazher",
  title =        "Spontaneous Facial Behavior Analysis Using Deep
                 Transformer-based Framework for Child-computer
                 Interaction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "43:1--43:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3539577",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3539577",
  abstract =     "A fascinating challenge in robotics-human interaction
                 is imitating the emotion recognition capability of
                 humans to robots with the aim to make human-robotics
                 interaction natural, genuine and intuitive. To achieve
                 the natural interaction in affective \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "43",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:FBH,
  author =       "Xiaowei Chen and Xiao Jiang and Lishuang Zhan and
                 Shihui Guo and Qunsheng Ruan and Guoliang Luo and
                 Minghong Liao and Yipeng Qin",
  title =        "Full-body Human Motion Reconstruction with Sparse
                 Joint Tracking Using Flexible Sensors",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "44:1--44:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3564700",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3564700",
  abstract =     "Human motion tracking is a fundamental building block
                 for various applications including computer animation,
                 human-computer interaction, healthcare, and so on. To
                 reduce the burden of wearing multiple sensors, human
                 motion prediction from sparse sensor \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "44",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Qiao:2024:SSL,
  author =       "Shanbao Qiao and Neal N. Xiong and Yongbin Gao and
                 Zhijun Fang and Wenjun Yu and Juan Zhang and Xiaoyan
                 Jiang",
  title =        "Self-Supervised Learning of Depth and Ego-Motion for
                 {$3$D} Perception in Human Computer Interaction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "45:1--45:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3588571",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3588571",
  abstract =     "3D perception of depth and ego-motion is of vital
                 importance in intelligent agent and Human Computer
                 Interaction (HCI) tasks, such as robotics and
                 autonomous driving. There are different kinds of
                 sensors that can directly obtain 3D depth information.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "45",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Kang:2024:DGN,
  author =       "Yan Kang and Bin Pu and Yongqi Kou and Yun Yang and
                 Jianguo Chen and Khan Muhammad and Po Yang and Lida Xu
                 and Mohammad Hijji",
  title =        "A Deep Graph Network with Multiple Similarity for User
                 Clustering in Human-Computer Interaction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "46:1--46:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3549954",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3549954",
  abstract =     "User counterparts, such as user attributes in social
                 networks or user interests, are the keys to more
                 natural Human-Computer Interaction (HCI). In addition,
                 users' attributes and social structures help us
                 understand the complex interactions in HCI. Most
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Mahmud:2024:SHA,
  author =       "Bahar Mahmud and Guan Hong and Bernard Fong",
  title =        "A Study of Human--{AI} Symbiosis for Creative Work:
                 Recent Developments and Future Directions in Deep
                 Learning",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "47:1--47:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3542698",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3542698",
  abstract =     "Recent advances in Artificial Intelligence (AI),
                 particularly deep learning, are having an enormous
                 impact on our society today. Record numbers of jobs
                 previously held by people have been automated, from
                 manufacturing to transportation to customer \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "47",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Gu:2024:PPR,
  author          = {Xiaoling Gu and Jie Huang and Yongkang Wong and Jun Yu
                     and Jianping Fan and Pai Peng and Mohan S.
                     Kankanhalli},
  title           = {{PAINT}: Photo-realistic Fashion Design Synthesis},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {48},
  pages           = {48:1--48:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3545610},
  URL             = {https://dl.acm.org/doi/10.1145/3545610},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {In this article, we investigate a new problem of
                     generating a variety of multi-view fashion designs
                     conditioned on a human pose and texture examples of
                     arbitrary sizes, which can replace the repetitive and
                     low-level design work for fashion designers. To
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dai:2024:UDA,
  author          = {Qingfeng Dai and Yongkang Wong and Guofei Sun and
                     Yanwei Wang and Zhou Zhou and Mohan S. Kankanhalli and
                     Xiangdong Li and Weidong Geng},
  title           = {Unsupervised Domain Adaptation by Causal Learning for
                     Biometric Signal-based {HCI}},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {49},
  pages           = {49:1--49:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3583885},
  URL             = {https://dl.acm.org/doi/10.1145/3583885},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Biometric signal based human-computer interface (HCI)
                     has attracted increasing attention due to its wide
                     application in healthcare, entertainment,
                     neurocomputing, and so on. In recent years, deep
                     learning-based approaches have made great progress on
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Xiao:2024:RRD,
  author          = {Yi Xiao and Tong Liu and Yu Han and Yue Liu and
                     Yongtian Wang},
  title           = {Realtime Recognition of Dynamic Hand Gestures in
                     Practical Applications},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {50},
  pages           = {50:1--50:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3561822},
  URL             = {https://dl.acm.org/doi/10.1145/3561822},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Dynamic hand gesture acting as a semaphoric gesture is
                     a practical and intuitive mid-air gesture interface.
                     Nowadays benefiting from the development of deep
                     convolutional networks, the gesture recognition has
                     already achieved a high accuracy, however, \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gou:2024:HMA,
  author          = {Jianping Gou and Liyuan Sun and Baosheng Yu and
                     Shaohua Wan and Dacheng Tao},
  title           = {Hierarchical Multi-Attention Transfer for Knowledge
                     Distillation},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {51},
  pages           = {51:1--51:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3568679},
  URL             = {https://dl.acm.org/doi/10.1145/3568679},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Knowledge distillation (KD) is a powerful and widely
                     applicable technique for the compression of deep
                     learning models. The main idea of knowledge
                     distillation is to transfer knowledge from a large
                     teacher model to a small student model, where the
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Deb:2024:AIC,
  author          = {Subhrajyoti Deb and Abhilash Das and Nirmalya Kar},
  title           = {An Applied Image Cryptosystem on {Moore}'s Automaton
                     Operating on {$ \delta (q_k) / F_2 $}},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {52},
  pages           = {52:1--52:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3614433},
  URL             = {https://dl.acm.org/doi/10.1145/3614433},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {The volume of multimedia-based image data or video
                     frames in Web 3.0 is constantly increasing, owing to
                     the advancement of real-time data transmission.
                     However, security vulnerabilities frequently impair the
                     performance of real-time applications. Many \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{You:2024:IAV,
  author          = {Sisi You and Yukun Zuo and Hantao Yao and Changsheng
                     Xu},
  title           = {Incremental Audio-Visual Fusion for Person Recognition
                     in Earthquake Scene},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {53},
  pages           = {53:1--53:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3614434},
  URL             = {https://dl.acm.org/doi/10.1145/3614434},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Earthquakes have a profound impact on social harmony
                     and property, resulting in damage to buildings and
                     infrastructure. Effective earthquake rescue efforts
                     require rapid and accurate determination of whether any
                     survivors are trapped in the rubble of \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sun:2024:BSG,
  author          = {Shiqi Sun and Danlan Huang and Xiaoming Tao and
                     Chengkang Pan and Guangyi Liu and Changwen Chen},
  title           = {Boosting Scene Graph Generation with Contextual
                     Information},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {54},
  pages           = {54:1--54:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3615868},
  URL             = {https://dl.acm.org/doi/10.1145/3615868},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Scene graph generation (SGG) has been developed to
                     detect objects and their relationships from the visual
                     data and has attracted increasing attention in recent
                     years. Existing works have focused on extracting object
                     context for SGG. However, very few \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zheng:2024:CAG,
  author          = {Jianwei Zheng and Yu Liu and Yuchao Feng and Honghui
                     Xu and Meiyu Zhang},
  title           = {Contrastive Attention-guided Multi-level Feature
                     Registration for Reference-based Super-resolution},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {55},
  pages           = {55:1--55:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3616495},
  URL             = {https://dl.acm.org/doi/10.1145/3616495},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Given low-quality input and assisted by referential
                     images, reference-based super-resolution (RefSR)
                     strives to enlarge the spatial size with the guarantee
                     of realistic textures, for which sophisticated
                     feature-matching strategies are naturally demanded.
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wu:2024:AAL,
  author =       "Shangxi Wu and Jitao Sang and Kaiyan Xu and Guanhua
                 Zheng and Changsheng Xu",
  title =        "Adaptive Adversarial Logits Pairing",
  journal =      j-TOMM,
  volume =       "20",
  number =       "2",
  pages =        "56:1--56:??",
  month =        feb,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3616375",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Fri Nov 3 14:55:26 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3616375",
  abstract =     "Adversarial examples provide an opportunity as well as
                 impose a challenge for understanding image
                 classification systems. Based on the analysis of the
                 adversarial training solution---Adversarial Logits
                 Pairing (ALP), we observed in this work that: (1) The
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "56",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:BBA,
  author          = {Ying Chen and Rui Yao and Yong Zhou and Jiaqi Zhao and
                     Bing Liu and Abdulmotaleb {El Saddik}},
  title           = {Black-box Attack against Self-supervised Video Object
                     Segmentation Models with Contrastive Loss},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {57},
  pages           = {57:1--57:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3617502},
  URL             = {https://dl.acm.org/doi/10.1145/3617502},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Deep learning models have been proven to be
                     susceptible to malicious adversarial attacks, which
                     manipulate input images to deceive the model into
                     making erroneous decisions. Consequently, the threat
                     posed to these models serves as a poignant reminder of
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Liang:2024:RFO,
  author          = {Shuang Liang and Wentao Ma and Chi Xie},
  title           = {Relation with Free Objects for Action Recognition},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {58},
  pages           = {58:1--58:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3617596},
  URL             = {https://dl.acm.org/doi/10.1145/3617596},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Relevant objects are widely used for aiding human
                     action recognition in still images. Such objects are
                     founded by a dedicated and pre-trained object detector
                     in all previous methods. Such methods have two
                     drawbacks. First, training an object detector
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{He:2024:FMW,
  author          = {Qiaolin He and Zhijie Zheng and Haifeng Hu},
  title           = {A Feature Map is Worth a Video Frame: Rethinking
                     Convolutional Features for Visible-Infrared Person
                     Re-identification},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {59},
  pages           = {59:1--59:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3617375},
  URL             = {https://dl.acm.org/doi/10.1145/3617375},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Visible-Infrared Person Re-identification (VI-ReID)
                     aims to search for the identity of the same person
                     across different spectra. The feature maps obtained
                     from the convolutional layers are generally used for
                     loss calculation in the later stages of the \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Huang:2024:GCL,
  author          = {Wuliang Huang and Yiqiang Chen and Xinlong Jiang and
                     Teng Zhang and Qian Chen},
  title           = {{GJFusion}: a Channel-Level Correlation Construction
                     Method for Multimodal Physiological Signal Fusion},
  journal         = j-TOMM,
  volume          = {20},
  number          = {2},
  articleno       = {60},
  pages           = {60:1--60:??},
  month           = feb,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3617503},
  URL             = {https://dl.acm.org/doi/10.1145/3617503},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Physiological signal based ubiquitous computing has
                     garnered significant attention. However, the
                     heterogeneity among multimodal physiological signals
                     poses a critical challenge to practical applications.
                     To traverse this heterogeneity gap, recent studies
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Fri Nov 3 14:55:26 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Shen:2024:SAC,
  author          = {Chengji Shen and Zhenjiang Liu and Xin Gao and Zunlei
                     Feng and Mingli Song},
  title           = {Self-Adaptive Clothing Mapping Based Virtual Try-on},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {61},
  pages           = {61:1--61:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3613453},
  URL             = {https://dl.acm.org/doi/10.1145/3613453},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {VTON (Virtual Try-ON), as an innovative visual
                     application in e-commerce scenarios with great
                     commercial value, has been widely studied in recent
                     years. Due to its better robustness and realistic
                     effect, deformation-synthesize-based VTON has become
                     the \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Baldrati:2024:CIR,
  author          = {Alberto Baldrati and Marco Bertini and Tiberio
                     Uricchio and Alberto {Del Bimbo}},
  title           = {Composed Image Retrieval using Contrastive Learning
                     and Task-oriented {CLIP}-based Features},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {62},
  pages           = {62:1--62:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3617597},
  URL             = {https://dl.acm.org/doi/10.1145/3617597},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Given a query composed of a reference image and a
                     relative caption, the Composed Image Retrieval goal is
                     to retrieve images visually similar to the reference
                     one that integrates the modifications expressed by the
                     caption. Given that recent research has \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wang:2024:CMM,
  author          = {Yan Wang and Peize Li and Qingyi Si and Hanwen Zhang
                     and Wenyu Zang and Zheng Lin and Peng Fu},
  title           = {Cross-modality Multiple Relations Learning for
                     Knowledge-based Visual Question Answering},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {63},
  pages           = {63:1--63:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3618301},
  URL             = {https://dl.acm.org/doi/10.1145/3618301},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Knowledge-based visual question answering not only
                     needs to answer the questions based on images but also
                     incorporates external knowledge to study reasoning in
                     the joint space of vision and language. To bridge the
                     gap between visual content and semantic \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Guo:2024:IDB,
  author          = {Qiang Guo and Zhi Zhang and Mingliang Zhou and Hong
                     Yue and Huayan Pu and Jun Luo},
  title           = {Image Defogging Based on Regional Gradient Constrained
                     Prior},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {64},
  pages           = {64:1--64:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3617834},
  URL             = {https://dl.acm.org/doi/10.1145/3617834},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Foggy days limit the functionality of outdoor
                     surveillance systems. However, it is still a challenge
                     for existing methods to maintain the uniformity of
                     defogging between image regions with a similar depth of
                     field and large differences in appearance. To
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Guo:2024:PDP,
  author          = {Jintao Guo and Lei Qi and Yinghuan Shi and Yang Gao},
  title           = {{PLACE Dropout}: a Progressive Layer-wise and
                     Channel-wise Dropout for Domain Generalization},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {65},
  pages           = {65:1--65:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3624015},
  URL             = {https://dl.acm.org/doi/10.1145/3624015},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Domain generalization (DG) aims to learn a generic
                     model from multiple observed source domains that
                     generalizes well to arbitrary unseen target domains
                     without further training. The major challenge in DG is
                     that the model inevitably faces a severe \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Xiong:2024:VLS,
  author          = {Yuan Xiong and Jingru Wang and Zhong Zhou},
  title           = {{VirtualLoc}: Large-scale Visual Localization Using
                     Virtual Images},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {66},
  pages           = {66:1--66:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3622788},
  URL             = {https://dl.acm.org/doi/10.1145/3622788},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Robust and accurate camera pose estimation is
                     fundamental in computer vision. Learning-based
                     regression approaches acquire six-degree-of-freedom
                     camera parameters accurately from visual cues of an
                     input image. However, most are trained on street-view
                     and \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2024:ECD,
  author          = {Yiheng Zhang and Ting Yao and Zhaofan Qiu and Tao
                     Mei},
  title           = {Explaining Cross-domain Recognition with Interpretable
                     Deep Classifier},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {67},
  pages           = {67:1--67:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3623399},
  URL             = {https://dl.acm.org/doi/10.1145/3623399},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {The recent advances in deep learning predominantly
                     construct models in their internal representations, and
                     it is opaque to explain the rationale behind and
                     decisions to human users. Such explainability is
                     especially essential for domain adaptation, whose
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wang:2024:AGM,
  author          = {Ruimin Wang and Fasheng Wang and Yiming Su and Jing
                     Sun and Fuming Sun and Haojie Li},
  title           = {Attention-guided Multi-modality Interaction Network
                     for {RGB-D} Salient Object Detection},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {68},
  pages           = {68:1--68:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3624747},
  URL             = {https://dl.acm.org/doi/10.1145/3624747},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {The past decade has witnessed great progress in RGB-D
                     salient object detection (SOD). However, there are two
                     bottlenecks that limit its further development. The
                     first one is low-quality depth maps. Most existing
                     methods directly use raw depth maps to \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rime:2024:HWY,
  author          = {Jemily Rime and Alan Archer-Boyd and Tom Collins},
  title           = {How Will You Pod? {Implications} of Creators'
                     Perspectives for Designing Innovative Podcasting
                     Tools},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {69},
  pages           = {69:1--69:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3625099},
  URL             = {https://dl.acm.org/doi/10.1145/3625099},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {While centred on the medium of audio, podcasts are
                     often a multimedia concern, and one that has become
                     hugely popular in recent years, though relatively
                     little is known about the perspectives of podcast
                     creators and their visions of innovation. This
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cheung:2024:LPF,
  author          = {Ming Cheung},
  title           = {Learning from the Past: Fast {NAS} for Tasks and
                     Datasets},
  journal         = j-TOMM,
  volume          = {20},
  number          = {3},
  articleno       = {70},
  pages           = {70:1--70:??},
  month           = mar,
  year            = {2024},
  DOI             = {https://doi.org/10.1145/3618000},
  URL             = {https://dl.acm.org/doi/10.1145/3618000},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  CODEN           = {????},
  abstract        = {Nowadays, with the advancement of technology, many
                     retail companies require in-house data scientist teams
                     to build machine learning tasks, such as user
                     segmentation and item price prediction. These teams
                     typically use a trial-and-error process to obtain a
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
  bibdate         = {Thu Dec 21 10:47:32 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  acknowledgement = ack-nhfb,
}

@Article{Li:2024:UIQ,
  author =       "Xinyue Li and Haiyong Xu and Gangyi Jiang and Mei Yu
                 and Ting Luo and Xuebo Zhang and Hongwei Ying",
  title =        "Underwater Image Quality Assessment from Synthetic to
                 Real-world: Dataset and Objective Method",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "71:1--71:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3624983",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3624983",
  abstract =     "The complicated underwater environment and lighting
                 conditions lead to severe influence on the quality of
                 underwater imaging, which tends to impair underwater
                 exploration and research. To effectively evaluate the
                 quality of underwater images, an \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "71",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Hou:2024:DLL,
  author =       "Sujuan Hou and Jiacheng Li and Weiqing Min and Qiang
                 Hou and Yanna Zhao and Yuanjie Zheng and Shuqiang
                 Jiang",
  title =        "Deep Learning for Logo Detection: a Survey",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "72:1--72:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3611309",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3611309",
  abstract =     "Logo detection has gradually become a research hotspot
                 in the field of computer vision and multimedia for its
                 various applications, such as social media monitoring,
                 intelligent transportation, and video advertising
                 recommendation. Recent advances in this \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "72",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Peng:2024:DLB,
  author =       "Yunjie Peng and Jinlin Wu and Boqiang Xu and Chunshui
                 Cao and Xu Liu and Zhenan Sun and Zhiqiang He",
  title =        "Deep Learning Based Occluded Person Re-Identification:
                 a Survey",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "73:1--73:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3610534",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3610534",
  abstract =     "Occluded person re-identification (Re-ID) focuses on
                 addressing the occlusion problem when retrieving the
                 person of interest across non-overlapping cameras. With
                 the increasing demand for intelligent video
                 surveillance and the application of person Re-ID
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "73",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Manzoor:2024:MRL,
  author =       "Muhammad Arslan Manzoor and Sarah Albarri and Ziting
                 Xian and Zaiqiao Meng and Preslav Nakov and Shangsong
                 Liang",
  title =        "Multimodality Representation Learning: a Survey on
                 Evolution, Pretraining and Its Applications",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "74:1--74:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617833",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617833",
  abstract =     "Multimodality Representation Learning, as a technique
                 of learning to embed information from different
                 modalities and their correlations, has achieved
                 remarkable success on a variety of applications, such
                 as Visual Question Answering (VQA), Natural \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "74",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Shi:2024:BFS,
  author =       "Yanyan Shi and Shaowu Yang and Wenjing Yang and Dianxi
                 Shi and Xuehui Li",
  title =        "Boosting Few-shot Object Detection with Discriminative
                 Representation and Class Margin",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "75:1--75:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3608478",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3608478",
  abstract =     "Classifying and accurately locating a visual category
                 with few annotated training samples in computer vision
                 has motivated the few-shot object detection technique,
                 which exploits transferring the source-domain detection
                 model to the target domain. Under \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "75",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Cheng:2024:VFH,
  author =       "Harry Cheng and Yangyang Guo and Tianyi Wang and Qi Li
                 and Xiaojun Chang and Liqiang Nie",
  title =        "Voice-Face Homogeneity Tells Deepfake",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "76:1--76:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625231",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625231",
  abstract =     "Detecting forgery videos is highly desirable due to
                 the abuse of deepfake. Existing detection approaches
                 contribute to exploring the specific artifacts in
                 deepfake videos and fit well on certain data. However,
                 the growing technique on these artifacts \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "76",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ye:2024:VSA,
  author =       "Jin Ye and Meng Dan and Wenchao Jiang",
  title =        "A Visual Sensitivity Aware {ABR} Algorithm for {DASH}
                 via Deep Reinforcement Learning",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "77:1--77:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3591108",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3591108",
  abstract =     "In order to cope with the fluctuation of network
                 bandwidth and provide smooth video services, adaptive
                 video streaming technology is proposed. In particular,
                 the adaptive bitrate (ABR) algorithm is widely used in
                 dynamic adaptive streaming over HTTP (DASH). \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "77",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Wang:2024:TRH,
  author =       "Jian Wang and Xiao Wang and Guosheng Zhao",
  title =        "Task Recommendation via Heterogeneous Multi-modal
                 Features and Decision Fusion in Mobile Crowdsensing",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "78:1--78:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3626239",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3626239",
  abstract =     "In the decision-making process of the behavior of
                 mobile crowdsensing, using a single view to learn a
                 user's preference will lead to a mismatch between the
                 user's wishes and the final task recommendation list,
                 resulting in the low efficiency of the model \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "78",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Lei:2024:BDV,
  author =       "Si-Chao Lei and Yue-Jiao Gong and Xiao-Lin Xiao and
                 Yi-Cong Zhou and Jun Zhang",
  title =        "Boosting Diversity in Visual Search with {Pareto}
                 Non-Dominated Re-Ranking",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "79:1--79:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625296",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625296",
  abstract =     "The field of visual search has gained significant
                 attention recently, particularly in the context of web
                 search engines and e-commerce product search platforms.
                 However, the abundance of web images presents a
                 challenge for modern image retrieval systems,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "79",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Zhang:2024:ISS,
  author =       "Huijie Zhang and Pu Li and Xiaobai Liu and Xianfeng
                 Yang and Li An",
  title =        "An Iterative Semi-supervised Approach with Pixel-wise
                 Contrastive Loss for Road Extraction in Aerial Images",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "80:1--80:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3606374",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3606374",
  abstract =     "Extracting roads in aerial images has numerous
                 applications in artificial intelligence and multimedia
                 computing, including traffic pattern analysis and
                 parking space planning. Learning deep neural networks,
                 though very successful, demand vast amounts of
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "80",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Fang:2024:IAS,
  author =       "Jing Fang and Yinbo Yu and Zhongyuan Wang and Xin Ding
                 and Ruimin Hu",
  title =        "An Image Arbitrary-Scale Super-Resolution Network
                 Using Frequency-domain Information",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "81:1--81:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3616376",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3616376",
  abstract =     "Image super-resolution (SR) is a technique to recover
                 lost high-frequency information in low-resolution (LR)
                 images. Since spatial-domain information has been
                 widely exploited, there is a new trend to involve
                 frequency-domain information in SR tasks. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "81",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Luo:2024:TES,
  author =       "Xiao Luo and Wei Ju and Yiyang Gu and Yifang Qin and
                 Siyu Yi and Daqing Wu and Luchen Liu and Ming Zhang",
  title =        "Toward Effective Semi-supervised Node Classification
                 with Hybrid Curriculum Pseudo-labeling",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "82:1--82:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3626528",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3626528",
  abstract =     "Semi-supervised node classification is a crucial
                 challenge in relational data mining and has attracted
                 increasing interest in research on graph neural
                 networks (GNNs). However, previous approaches merely
                 utilize labeled nodes to supervise the overall
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "82",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Guo:2024:FDN,
  author =       "Wen Guo and Wuzhou Quan and Junyu Gao and Tianzhu
                 Zhang and Changsheng Xu",
  title =        "Feature Disentanglement Network: Multi-Object Tracking
                 Needs More Differentiated Features",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "83:1--83:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3626825",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3626825",
  abstract =     "To reduce computational redundancies, a common
                 approach is to integrate detection and
                 re-identification (Re-ID) into a single network in
                 multi-object tracking (MOT), referred to as ``tracking
                 by detection.'' Most of the previous research has
                 focused on \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "83",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Khaleel:2024:VVC,
  author =       "Mohammed Khaleel and Azeez Idris and Wallapak
                 Tavanapong and Jacob R. Pratt and Junghwan Oh and Piet
                 C. de Groen",
  title =        "{VisActive}: Visual-concept-based Active Learning for
                 Image Classification under Class Imbalance",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "84:1--84:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617999",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617999",
  abstract =     "Active learning methods recommend the most informative
                 images from a large unlabeled dataset for manual
                 labeling. These methods improve the performance of an
                 image classifier while minimizing manual labeling
                 efforts. We propose VisActive, a visual-concept-based
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "84",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Chen:2024:GLB,
  author =       "Honghua Chen and Zhiqi Li and Mingqing Wei and Jun
                 Wang",
  title =        "Geometric and Learning-Based Mesh Denoising: a
                 Comprehensive Survey",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "85:1--85:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625098",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625098",
  abstract =     "Mesh denoising is a fundamental problem in digital
                 geometry processing. It seeks to remove surface noise
                 while preserving surface intrinsic signals as
                 accurately as possible. While traditional wisdom has
                 been built upon specialized priors to smooth \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "85",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Han:2024:BNL,
  author =       "Ning Han and Yawen Zeng and Chuhao Shi and Guangyi
                 Xiao and Hao Chen and Jingjing Chen",
  title =        "{BiC-Net}: Learning Efficient Spatio-temporal Relation
                 for Text-Video Retrieval",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "86:1--86:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3627103",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3627103",
  abstract =     "The task of text-video retrieval aims to understand
                 the correspondence between language and vision and has
                 gained increasing attention in recent years. Recent
                 works have demonstrated the superiority of local
                 spatio-temporal relation learning with graph-
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "86",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Feng:2024:ADD,
  author =       "Yuan Feng and Yaojun Hu and Pengfei Fang and Sheng Liu
                 and Yanhong Yang and Shengyong Chen",
  title =        "Asymmetric Dual-Decoder {U-Net} for Joint Rain and
                 Haze Removal",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "87:1--87:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3628451",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3628451",
  abstract =     "This work studies the multi-weather restoration
                 problem. In real-life scenarios, rain and haze, two
                 often co-occurring common weather phenomena, can
                 greatly degrade the clarity and quality of the scene
                 images, leading to a performance drop in the visual
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "87",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xie:2024:SGD,
  author =       "Yurui Xie and Ling Guan",
  title =        "Sparsity-guided Discriminative Feature Encoding for
                 Robust Keypoint Detection",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "88:1--88:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3628432",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3628432",
  abstract =     "Existing handcrafted keypoint detectors typically
                 focus on designing specific local structures manually
                 while ignoring whether they have enough flexibility to
                 explore diverse visual patterns in an image. Despite
                 the advancement of learning-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "88",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Beuve:2024:HLD,
  author =       "Nicolas Beuve and Wassim Hamidouche and Olivier
                 D{\'e}forges",
  title =        "Hierarchical Learning and Dummy Triplet Loss for
                 Efficient Deepfake Detection",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "89:1--89:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3626101",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3626101",
  abstract =     "The advancement of generative models has made it
                 easier to create highly realistic Deepfake videos. This
                 accessibility has led to a surge in research on
                 Deepfake detection to mitigate potential misuse.
                 Typically, Deepfake detection models utilize binary
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "89",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Xiang:2024:RPR,
  author =       "Suncheng Xiang and Dahong Qian and Jingsheng Gao and
                 Zirui Zhang and Ting Liu and Yuzhuo Fu",
  title =        "Rethinking Person Re-Identification via Semantic-based
                 Pretraining",
  journal =      j-TOMM,
  volume =       "20",
  number =       "3",
  pages =        "90:1--90:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3628452",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Thu Dec 21 10:47:32 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3628452",
  abstract =     "Pretraining is a dominant paradigm in computer vision.
                 Generally, supervised ImageNet pretraining is commonly
                 used to initialize the backbones of person
                 re-identification (Re-ID) models. However, recent works
                 show a surprising result that CNN-based \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "90",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Peng:2024:HSE,
  author =       "Min Peng and Xiaohu Shao and Yu Shi and Xiangdong
                 Zhou",
  title =        "Hierarchical Synergy-Enhanced Multimodal Relational
                 Network for Video Question Answering",
  journal =      j-TOMM,
  volume =       "20",
  number =       "4",
  pages =        "91:1--91:??",
  month =        apr,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630101",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 13 15:13:22 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630101",
  abstract =     "Video question answering (VideoQA) is challenging as
                 it requires reasoning about natural language and
                 multimodal interactive relations. Most existing methods
                 apply attention mechanisms to extract interactions
                 between the question and the video or to \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "91",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Ren:2024:CIT,
  author =       "Bin Ren and Hao Tang and Fanyang Meng and Runwei Ding
                 and Philip H. S. Torr and Nicu Sebe",
  title =        "Cloth Interactive Transformer for Virtual Try-On",
  journal =      j-TOMM,
  volume =       "20",
  number =       "4",
  pages =        "92:1--92:??",
  month =        apr,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3617374",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 13 15:13:22 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3617374",
  abstract =     "The 2D image-based virtual try-on has aroused
                 increased interest from the multimedia and computer
                 vision fields due to its enormous commercial value.
                 Nevertheless, most existing image-based virtual try-on
                 approaches directly combine the person-identity
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "92",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@Article{Nie:2024:CSI,
  author =       "Xiushan Nie and Yang Shi and Ziyu Meng and Jin Huang
                 and Weili Guan and Yilong Yin",
  title =        "Complex Scenario Image Retrieval via Deep
                 Similarity-aware Hashing",
  journal =      j-TOMM,
  volume =       "20",
  number =       "4",
  pages =        "93:1--93:??",
  month =        apr,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3624016",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 13 15:13:22 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3624016",
  abstract =     "When performing hashing-based image retrieval, it is
                 difficult to learn discriminative hash codes especially
                 for the multi-label, zero-shot and fine-grained
                 settings. This is due to the fact that the similarities
                 vary, even within the same category, under \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "93",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Tan:2024:CLS,
  author =       {Jiawei Tan and Hongxing Wang and Junsong Yuan},
  title =        {Characters Link Shots: Character Attention Network for
                 Movie Scene Segmentation},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {94:1--94:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3630257},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3630257},
  abstract =     {Movie scene segmentation aims to automatically segment
                 a movie into multiple story units, i.e., scenes, each
                 of which is a series of semantically coherent and
                 time-continual shots. Previous methods have continued
                 efforts on shot semantic association, but \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {94},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Zhou:2024:RRT,
  author =       {Mingliang Zhou and Xinwen Zhao and Futing Luo and Jun
                 Luo and Huayan Pu and Tao Xiang},
  title =        {Robust {RGB-T} Tracking via Adaptive Modality Weight
                 Correlation Filters and Cross-modality Learning},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {95:1--95:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3630100},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3630100},
  abstract =     {RGBT tracking is gaining popularity due to its ability
                 to provide effective tracking results in a variety of
                 weather conditions. However, feature specificity and
                 complementarity have not been fully used in existing
                 models that directly fuse the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {95},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2024:SOQ,
  author =       {Zicheng Zhang and Wei Sun and Yingjie Zhou and Jun Jia
                 and Zhichao Zhang and Jing Liu and Xiongkuo Min and
                 Guangtao Zhai},
  title =        {Subjective and Objective Quality Assessment for
                 in-the-Wild Computer Graphics Images},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {96:1--96:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3631357},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3631357},
  abstract =     {Computer graphics images (CGIs) are artificially
                 generated by means of computer programs and are widely
                 perceived under various scenarios, such as games,
                 streaming media, etc. In practice, the quality of CGIs
                 consistently suffers from poor rendering \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {96},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Roy:2024:CLV,
  author =       {Shuvendu Roy and Ali Etemad},
  title =        {Contrastive Learning of View-invariant Representations
                 for Facial Expressions Recognition},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {97:1--97:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3632960},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3632960},
  abstract =     {Although there has been much progress in the area of
                 facial expression recognition (FER), most existing
                 methods suffer when presented with images that have
                 been captured from viewing angles that are non-frontal
                 and substantially different from those used \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {97},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Liu:2024:GRA,
  author =       {Jun Liu and Jiantao Zhou and Haiwei Wu and Weiwei Sun
                 and Jinyu Tian},
  title =        {Generating Robust Adversarial Examples against Online
                 Social Networks {(OSNs)}},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {98:1--98:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3632528},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3632528},
  abstract =     {Online Social Networks (OSNs) have blossomed into
                 prevailing transmission channels for images in the
                 modern era. Adversarial examples (AEs) deliberately
                 designed to mislead deep neural networks (DNNs) are
                 found to be fragile against the inevitable lossy
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {98},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Yao:2024:CMS,
  author =       {Tao Yao and Yiru Li and Ying Li and Yingying Zhu and
                 Gang Wang and Jun Yue},
  title =        {Cross-modal Semantically Augmented Network for
                 Image-text Matching},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {99:1--99:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3631356},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3631356},
  abstract =     {Image-text matching plays an important role in solving
                 the problem of cross-modal information processing.
                 Since there are nonnegligible semantic differences
                 between heterogeneous pairwise data, a crucial
                 challenge is how to learn a unified
                 representation. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {99},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Telili:2024:DBL,
  author =       {Ahmed Telili and Sid Ahmed Fezza and Wassim Hamidouche
                 and Hanene F. Z. Brachemi Meftah},
  title =        {{2BiVQA}: Double Bi-{LSTM}-based Video Quality
                 Assessment of {UGC} Videos},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {100:1--100:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3632178},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3632178},
  abstract =     {Recently, with the growing popularity of mobile
                 devices as well as video sharing platforms (e.g.,
                 YouTube, Facebook, TikTok, and Twitch), User-Generated
                 Content (UGC) videos have become increasingly common
                 and now account for a large portion of multimedia
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {100},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Chen:2024:WMS,
  author =       {Hongzhou Chen and Haihan Duan and Maha Abdallah and
                 Yufeng Zhu and Yonggang Wen and Abdulmotaleb {El
                 Saddik} and Wei Cai},
  title =        {{Web3 Metaverse}: State-of-the-Art and Vision},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {101:1--101:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3630258},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3630258},
  abstract =     {The metaverse, as a rapidly evolving socio-technical
                 phenomenon, exhibits significant potential across
                 diverse domains by leveraging Web3 (a.k.a. Web 3.0)
                 technologies such as blockchain, smart contracts, and
                 non-fungible tokens (NFTs). This survey aims \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {101},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Wang:2024:GBC,
  author =       {Lilong Wang and Yunhui Shi and Jin Wang and Shujun
                 Chen and Baocai Yin and Nam Ling},
  title =        {Graph Based Cross-Channel Transform for Color Image
                 Compression},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {102:1--102:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3631710},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3631710},
  abstract =     {Adaptive transform coding is gaining more and more
                 attention for better mining of image content over fixed
                 transforms such as discrete cosine transform (DCT). As
                 a special case, graph transform learning establishes a
                 novel paradigm for the graph-based \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {102},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Han:2024:SHO,
  author =       {Kai Han and Yu Liu and Rukai Wei and Ke Zhou and
                 Jinhui Xu and Kun Long},
  title =        {Supervised Hierarchical Online Hashing for Cross-modal
                 Retrieval},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {103:1--103:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3632527},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3632527},
  abstract =     {Online cross-modal hashing has gained attention for
                 its adaptability in processing streaming data. However,
                 existing methods only define the hard similarity
                 between data using labels. This results in poor
                 retrieval performance, as they fail to exploit the
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {103},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Fu:2024:SOT,
  author =       {Fengyi Fu and Shancheng Fang and Weidong Chen and
                 Zhendong Mao},
  title =        {Sentiment-Oriented Transformer-Based Variational
                 Autoencoder Network for Live Video Commenting},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {104:1--104:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3633334},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3633334},
  abstract =     {Automatic live video commenting is getting increasing
                 attention due to its significance in narration
                 generation, topic explanation, etc. However, the
                 diverse sentiment consideration of the generated
                 comments is missing from current methods. Sentimental
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {104},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Peng:2024:JCJ,
  author =       {Yuxiang Peng and Chong Fu and Guixing Cao and Wei Song
                 and Junxin Chen and Chiu-Wing Sham},
  title =        {{JPEG}-compatible Joint Image Compression and
                 Encryption Algorithm with File Size Preservation},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {105:1--105:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3633459},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3633459},
  abstract =     {Joint image compression and encryption algorithms are
                 intensively investigated due to their powerful
                 capability of simultaneous image data compression and
                 sensitive information protection. Unfortunately, most
                 of the existing algorithms suffered from \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {105},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Liu:2024:TEC,
  author =       {Daizong Liu and Xiaoye Qu and Jianfeng Dong and Pan
                 Zhou and Zichuan Xu and Haozhao Wang and Xing Di and
                 Weining Lu and Yu Cheng},
  title =        {Transform-Equivariant Consistency Learning for
                 Temporal Sentence Grounding},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {106:1--106:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3634749},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3634749},
  abstract =     {This paper addresses the temporal sentence grounding
                 (TSG). Although existing methods have made decent
                 achievements in this task, they not only severely rely
                 on abundant video-query paired data for training, but
                 also easily fail into the dataset \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {106},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Hu:2024:STR,
  author =       {Yijie Hu and Bin Dong and Kaizhu Huang and Lei Ding
                 and Wei Wang and Xiaowei Huang and Qiu-Feng Wang},
  title =        {Scene Text Recognition via Dual-path Network with
                 Shape-driven Attention Alignment},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {107:1--107:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3633517},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3633517},
  abstract =     {Scene text recognition (STR), one typical
                 sequence-to-sequence problem, has drawn much attention
                 recently in multimedia applications. To guarantee good
                 performance, it is essential for STR to obtain aligned
                 character-wise features from the whole-image \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {107},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Liang:2024:NHN,
  author =       {Rongjiao Liang and Shichao Zhang and Wenzhen Zhang and
                 Guixian Zhang and Jinyun Tang},
  title =        {Nonlocal Hybrid Network for Long-tailed Image
                 Classification},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {108:1--108:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3630256},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3630256},
  abstract =     {It is a significant issue to deal with long-tailed
                 data when classifying images. A nonlocal hybrid network
                 (NHN) that takes into account both feature learning and
                 classifier learning is proposed. The NHN can capture
                 the existence of dependencies between \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {108},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Shi:2024:DMC,
  author =       {Piao Shi and Min Hu and Xuefeng Shi and Fuji Ren},
  title =        {Deep Modular Co-Attention Shifting Network for
                 Multimodal Sentiment Analysis},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {109:1--109:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3634706},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3634706},
  abstract =     {Human Multimodal Sentiment Analysis (MSA) is an
                 attractive research that studies sentiment expressed
                 from multiple heterogeneous modalities. While
                 transformer-based methods have achieved great success,
                 designing an effective ``co-attention'' model to
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {109},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Zhang:2024:VLS,
  author =       {Jing Zhang and Dan Guo and Xun Yang and Peipei Song
                 and Meng Wang},
  title =        {Visual-linguistic-stylistic Triple Reward for
                 Cross-lingual Image Captioning},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {110:1--110:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3634917},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3634917},
  abstract =     {Generating image captions in different languages is
                 worth exploring and essential for non-native speakers.
                 Nevertheless, collecting paired annotation for every
                 language is time-consuming and impractical,
                 particularly for minor languages. To this end, the
                 \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {110},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Jia:2024:ENC,
  author =       {Zhaoyang Jia and Yan Lu and Houqiang Li},
  title =        {Exploring Neighbor Correspondence Matching for
                 Multiple-hypotheses Video Frame Synthesis},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {111:1--111:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3633780},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3633780},
  abstract =     {Video frame synthesis, which consists of interpolation
                 and extrapolation, is an essential video processing
                 technique that can be applied to various scenarios.
                 However, most existing methods cannot handle small
                 objects or large motion well, especially in \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {111},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Zhou:2024:GPI,
  author =       {Sheng Zhou and Dan Guo and Xun Yang and Jianfeng Dong
                 and Meng Wang},
  title =        {Graph Pooling Inference Network for Text-based {VQA}},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {112:1--112:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3634918},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3634918},
  abstract =     {Effectively leveraging objects and optical character
                 recognition (OCR) tokens to reason out pivotal scene
                 text is critical for the challenging Text-based Visual
                 Question Answering (TextVQA) task. Graph-based models
                 can effectively capture the semantic \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {112},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

@article{Hu:2024:OBS,
  author =       {Hengtong Hu and Lingxi Xie and Xinyue Huo and Richang
                 Hong and Qi Tian},
  title =        {One-Bit Supervision for Image Classification: Problem,
                 Solution, and Beyond},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {113:1--113:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3633779},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3633779},
  abstract =     {This article presents one-bit supervision, a novel
                 setting of learning with fewer labels, for image
                 classification. Instead of the training model using the
                 accurate label of each sample, our setting requires the
                 model to interact with the system by \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {113},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

%%% NB: in the abstract of Yuan:2024:DCB the 8K resolution was corrected
%%% from 7689 to 7680 (8K UHD is 7680 x 4320); verify against the
%%% publisher abstract if an exact quotation is required.
@Article{Yuan:2024:DCB,
  author =       "Hang Yuan and Wei Gao and Siwei Ma and Yiqiang Yan",
  title =        "Divide-and-conquer-based {RDO}-free {CU} Partitioning
                 for {8K} Video Compression",
  journal =      j-TOMM,
  volume =       "20",
  number =       "4",
  pages =        "114:1--114:??",
  month =        apr,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3634705",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 13 15:13:22 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3634705",
  abstract =     "8K (7680$ \times $4320) ultra-high definition (UHD)
                 videos are growing popular with the improvement of
                 human visual experience demand. Therefore, the
                 compression of 8K UHD videos has become a top priority
                 in the third-generation audio video coding standard
                 (AVS3). \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "114",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Li:2024:DWA,
  author =       {Mingyu Li and Tao Zhou and Zhuo Huang and Jian Yang
                 and Jie Yang and Chen Gong},
  title =        {Dynamic Weighted Adversarial Learning for
                 Semi-Supervised Classification under Intersectional
                 Class Mismatch},
  journal =      j-TOMM,
  volume =       {20},
  number =       {4},
  pages =        {115:1--115:??},
  month =        apr,
  year =         {2024},
  coden =        {????},
  doi =          {https://doi.org/10.1145/3635310},
  issn =         {1551-6857 (print), 1551-6865 (electronic)},
  issn-l =       {1551-6857},
  bibdate =      {Sat Jan 13 15:13:22 MST 2024},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  url =          {https://dl.acm.org/doi/10.1145/3635310},
  abstract =     {Nowadays, class-mismatch problem has drawn intensive
                 attention in Semi-Supervised Learning (SSL), where the
                 classes of labeled data are assumed to be only a subset
                 of the classes of unlabeled data. However, in a more
                 realistic scenario, the labeled data \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno =    {115},
  fjournal =     {ACM Transactions on Multimedia Computing,
                 Communications, and Applications},
  journal-url =  {https://dl.acm.org/loi/tomm},
}

%%% NB: in the abstract of Huang:2024:SLC a stray period after the
%%% dangling ``low-'' at the truncation point was removed; the word
%%% that followed the hyphen is not recoverable from this file.
@Article{Huang:2024:SLC,
  author =       "Hui Huang and Di Xiao and Jia Liang",
  title =        "Secure Low-complexity Compressive Sensing with
                 Preconditioning Prior Regularization Reconstruction",
  journal =      j-TOMM,
  volume =       "20",
  number =       "4",
  pages =        "116:1--116:??",
  month =        apr,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3635308",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Sat Jan 13 15:13:22 MST 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3635308",
  abstract =     "Compressive sensing (CS), a breakthrough technology in
                 image processing, provides a privacy-preserving layer
                 and image reconstruction while performing sensing and
                 recovery processes, respectively. Unfortunately, it
                 still faces high-complexity, low- \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "116",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Clement:2024:SDH,
  author          = {Nathan Clement and Alan Schoen and Arnold Boedihardjo
                     and Andrew Jenkins},
  title           = {Synthetic Data and Hierarchical Object Detection in
                     Overhead Imagery},
  journal         = j-TOMM,
  volume          = {20},
  number          = {4},
  pages           = {117:1--117:??},
  articleno       = {117},
  month           = apr,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3635309},
  url             = {https://dl.acm.org/doi/10.1145/3635309},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {The performance of neural network models is often
                     limited by the availability of big datasets. To treat
                     this problem, we survey and develop novel synthetic
                     data generation and augmentation techniques for
                     enhancing low/zero-sample learning in satellite
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Sat Jan 13 15:13:22 MST 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Bian:2024:PAL,
  author          = {Jiang Bian and Xuhong Li and Tao Wang and Qingzhong
                     Wang and Jun Huang and Chen Liu and Jun Zhao and
                     Feixiang Lu and Dejing Dou and Haoyi Xiong},
  title           = {{P$^2$ANet}: a Large-Scale Benchmark for Dense Action
                     Detection from Table Tennis Match Broadcasting Videos},
  journal         = j-TOMM,
  volume          = {20},
  number          = {4},
  pages           = {118:1--118:??},
  articleno       = {118},
  month           = apr,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3633516},
  url             = {https://dl.acm.org/doi/10.1145/3633516},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {While deep learning has been widely used for video
                     analytics, such as video classification and action
                     detection, dense action detection with fast-moving
                     subjects from sports videos is still challenging. In
                     this work, we release yet another sports video
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Sat Jan 13 15:13:22 MST 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Yang:2024:AIG,
  author          = {Jifan Yang and Zhongyuan Wang and Guangcheng Wang and
                     Baojin Huang and Yuhong Yang and Weiping Tu},
  title           = {Auxiliary Information Guided Self-attention for Image
                     Quality Assessment},
  journal         = j-TOMM,
  volume          = {20},
  number          = {4},
  pages           = {119:1--119:??},
  articleno       = {119},
  month           = apr,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3635716},
  url             = {https://dl.acm.org/doi/10.1145/3635716},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Image quality assessment (IQA) is an important problem
                     in computer vision with many applications. We propose a
                     transformer-based multi-task learning framework for the
                     IQA task. Two subtasks: constructing an auxiliary
                     information error map and completing \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Sat Jan 13 15:13:22 MST 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Feng:2024:EVT,
  author          = {Zhanzhou Feng and Jiaming Xu and Lei Ma and Shiliang
                     Zhang},
  title           = {Efficient Video Transformers via Spatial-temporal
                     Token Merging for Action Recognition},
  journal         = j-TOMM,
  volume          = {20},
  number          = {4},
  pages           = {120:1--120:??},
  articleno       = {120},
  month           = apr,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3633781},
  url             = {https://dl.acm.org/doi/10.1145/3633781},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Transformer has exhibited promising performance in
                     various video recognition tasks but brings a huge
                     computational cost in modeling spatial-temporal cues.
                     This work aims to boost the efficiency of existing
                     video transformers for action recognition through
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Sat Jan 13 15:13:22 MST 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Zhang:2024:PCA,
  author          = {Shupei Zhang and Chenqiu Zhao and Anup Basu},
  title           = {Principal Component Approximation Network for Image
                     Compression},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {121:1--121:??},
  articleno       = {121},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3637490},
  url             = {https://dl.acm.org/doi/10.1145/3637490},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {In this work, we propose a novel principal component
                     approximation network (PCANet) for image compression.
                     The proposed network is based on the assumption that a
                     set of images can be decomposed into several shared
                     feature matrices, and an image can be \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Zhang:2024:TEC,
  author          = {Tianyu Zhang and Weiqing Min and Tao Liu and Shuqiang
                     Jiang and Yong Rui},
  title           = {Toward Egocentric Compositional Action Anticipation
                     with Adaptive Semantic Debiasing},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {122:1--122:??},
  articleno       = {122},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3633333},
  url             = {https://dl.acm.org/doi/10.1145/3633333},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Predicting the unknown from the first-person
                     perspective is expected as a necessary step toward
                     machine intelligence, which is essential for practical
                     applications including autonomous driving and robotics.
                     As a human-level task, egocentric action \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Liu:2024:AVT,
  author          = {Yu Liu and Mingbo Zhao and Zhao Zhang and Yuping Liu
                     and Shuicheng Yan},
  title           = {Arbitrary Virtual Try-on Network: Characteristics
                     Preservation and Tradeoff between Body and Clothing},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {123:1--123:??},
  articleno       = {123},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3636426},
  url             = {https://dl.acm.org/doi/10.1145/3636426},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Deep learning based virtual try-on system has achieved
                     some encouraging progress recently, but there still
                     remain several big challenges that need to be solved,
                     such as trying on arbitrary clothes of all types,
                     trying on the clothes from one category to \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Yang:2024:CCM,
  author          = {Shih-Wei Yang and Li-Hsiang Shen and Hong-Han Shuai
                     and Kai-Ten Feng},
  title           = {{CMAF}: Cross-Modal Augmentation via Fusion for
                     Underwater Acoustic Image Recognition},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {124:1--124:??},
  articleno       = {124},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3636427},
  url             = {https://dl.acm.org/doi/10.1145/3636427},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Underwater image recognition is crucial for underwater
                     detection applications. Fish classification has been
                     one of the emerging research areas in recent years.
                     Existing image classification models usually classify
                     data collected from terrestrial \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Zhang:2024:SAR,
  author          = {Yazhou Zhang and Yang Yu and Mengyao Wang and Min
                     Huang and M. Shamim Hossain},
  title           = {Self-Adaptive Representation Learning Model for
                     Multi-Modal Sentiment and Sarcasm Joint Analysis},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {125:1--125:??},
  articleno       = {125},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3635311},
  url             = {https://dl.acm.org/doi/10.1145/3635311},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Sentiment and sarcasm are intimate and complex, as
                     sarcasm often deliberately elicits an emotional
                     response in order to achieve its specific purpose.
                     Current challenges in multi-modal sentiment and sarcasm
                     joint detection mainly include multi-modal \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Qi:2024:DSD,
  author          = {Lei Qi and Peng Dong and Tan Xiong and Hui Xue and Xin
                     Geng},
  title           = {{DoubleAUG}: Single-domain Generalized Object Detector
                     in Urban via Color Perturbation and Dual-style Memory},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {126:1--126:??},
  articleno       = {126},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3634683},
  url             = {https://dl.acm.org/doi/10.1145/3634683},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Object detection in urban scenarios is crucial for
                     autonomous driving in intelligent traffic systems.
                     However, unlike conventional object detection tasks,
                     urban-scene images vary greatly in style. For example,
                     images taken on sunny days differ \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Shi:2024:ICM,
  author          = {Dan Shi and Lei Zhu and Jingjing Li and Guohua Dong
                     and Huaxiang Zhang},
  title           = {Incomplete Cross-Modal Retrieval with Deep Correlation
                     Transfer},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {127:1--127:??},
  articleno       = {127},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3637442},
  url             = {https://dl.acm.org/doi/10.1145/3637442},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Most cross-modal retrieval methods assume the
                     multi-modal training data is complete and has a
                     one-to-one correspondence. However, in the real world,
                     multi-modal data generally suffers from missing
                     modality information due to the uncertainty of data
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Zeng:2024:MPS,
  author          = {Xianhua Zeng and Xinyu Wang and Yicai Xie},
  title           = {Multiple Pseudo-{Siamese} Network with Supervised
                     Contrast Learning for Medical Multi-modal Retrieval},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {128:1--128:??},
  articleno       = {128},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3637441},
  url             = {https://dl.acm.org/doi/10.1145/3637441},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Medical multi-modal retrieval aims to provide doctors
                     with similar medical images from different modalities,
                     which can greatly promote the efficiency and accuracy
                     of clinical diagnosis. However, most existing medical
                     retrieval methods hardly support the \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{You:2024:MOT,
  author          = {Sisi You and Hantao Yao and Bing-Kun Bao and
                     Changsheng Xu},
  title           = {Multi-object Tracking with Spatial-Temporal Tracklet
                     Association},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {129:1--129:??},
  articleno       = {129},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3635155},
  url             = {https://dl.acm.org/doi/10.1145/3635155},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Recently, the tracking-by-detection methods have
                     achieved excellent performance in Multi-Object Tracking
                     (MOT), which focuses on obtaining a robust feature for
                     each object and generating tracklets based on feature
                     similarity. However, they are confronted \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Bingol:2024:QEW,
  author          = {G{\"u}lnaziye Bing{\"o}l and Simone Porcu and
                     Alessandro Floris and Luigi Atzori},
  title           = {{QoE} Estimation of {WebRTC}-based Audio-visual
                     Conversations from Facial and Speech Features},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {130:1--130:??},
  articleno       = {130},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3638251},
  url             = {https://dl.acm.org/doi/10.1145/3638251},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {The utilization of user's facial- and speech-related
                     features for the estimation of the Quality of
                     Experience (QoE) of multimedia services is still
                     underinvestigated despite its potential. Currently,
                     only the use of either facial or speech features
                     \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Qiu:2024:LOP,
  author          = {Heqian Qiu and Hongliang Li and Qingbo Wu and Hengcan
                     Shi and Lanxiao Wang and Fanman Meng and Linfeng Xu},
  title           = {Learning Offset Probability Distribution for Accurate
                     Object Detection},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {131:1--131:??},
  articleno       = {131},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3637214},
  url             = {https://dl.acm.org/doi/10.1145/3637214},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Object detection combines object classification and
                     object localization problems. Current object detection
                     methods heavily depend on regression networks to locate
                     objects, which are optimized with various regression
                     loss functions to predict offsets \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Floris:2024:CMP,
  author          = {Alessandro Floris and Simone Porcu and Luigi Atzori},
  title           = {Controlling Media Player with Hands: a Transformer
                     Approach and a Quality of Experience Assessment},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {132:1--132:??},
  articleno       = {132},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3638560},
  url             = {https://dl.acm.org/doi/10.1145/3638560},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {In this article, we propose a Hand Gesture Recognition
                     (HGR) system based on a novel deep transformer (DT)
                     neural network for media player control. The extracted
                     hand skeleton features are processed by separate
                     transformers for each finger in isolation to \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Li:2024:EVR,
  author          = {Jingyu Li and Zhendong Mao and Hao Li and Weidong Chen
                     and Yongdong Zhang},
  title           = {Exploring Visual Relationships via Transformer-based
                     Graphs for Enhanced Image Captioning},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {133:1--133:??},
  articleno       = {133},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3638558},
  url             = {https://dl.acm.org/doi/10.1145/3638558},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Image captioning (IC), bringing vision to language,
                     has drawn extensive attention. A crucial aspect of IC
                     is the accurate depiction of visual relations among
                     image objects. Visual relations encompass two primary
                     facets: content relations and structural \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Ma:2024:HLD,
  author          = {Zeyu Ma and Siwei Wang and Xiao Luo and Zhonghui Gu
                     and Chong Chen and Jinxing Li and Xian-Sheng Hua and
                     Guangming Lu},
  title           = {{HARR}: Learning Discriminative and High-Quality Hash
                     Codes for Image Retrieval},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {134:1--134:??},
  articleno       = {134},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3627162},
  url             = {https://dl.acm.org/doi/10.1145/3627162},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {This article studies deep unsupervised hashing, which
                     has attracted increasing attention in large-scale image
                     retrieval. The majority of recent approaches usually
                     reconstruct semantic similarity information, which then
                     guides the hash code learning. \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

%%% NOTE: the abstract is an excerpt; the final \ldots{} marks where the
%%% text is cut off (here inside a parenthesized, hyphenated compound).
@Article{Zhang:2024:CWS,
  author =       "Chengyang Zhang and Yong Zhang and Bo Li and Xinglin
                 Piao and Baocai Yin",
  title =        "{CrowdGraph}: Weakly supervised Crowd Counting via
                 Pure Graph Neural Network",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "135:1--135:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3638774",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3638774",
  abstract =     "Most existing weakly supervised crowd counting methods
                 utilize Convolutional Neural Networks (CNN) or
                 Transformer to estimate the total number of individuals
                 in an image. However, both CNN-based (grid-to-count
                 paradigm) and Transformer-based (sequence-to-\ldots{})",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "135",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Wang:2024:WGO,
  author          = {Jie Wang and Guoqiang Li and Jie Shi and Jinwen Xi},
  title           = {Weighted Guided Optional Fusion Network for {RGB-T}
                     Salient Object Detection},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {136:1--136:??},
  articleno       = {136},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3624984},
  url             = {https://dl.acm.org/doi/10.1145/3624984},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {There is no doubt that the rational and effective use
                     of visible and thermal infrared image data information
                     to achieve cross-modal complementary fusion is the key
                     to improving the performance of RGB-T salient object
                     detection (SOD). A meticulous analysis \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

%%% NOTE: the abstract is an excerpt; the final \ldots{} marks where the
%%% text is cut off.
@Article{Zhang:2024:JAV,
  author =       "Yibo Zhang and Weiguo Lin and Junfeng Xu",
  title =        "Joint Audio-Visual Attention with Contrastive Learning
                 for More General Deepfake Detection",
  journal =      j-TOMM,
  volume =       "20",
  number =       "5",
  pages =        "137:1--137:??",
  month =        may,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3625100",
  ISSN =         "1551-6857 (print), 1551-6865 (electronic)",
  ISSN-L =       "1551-6857",
  bibdate =      "Wed Apr 10 08:42:41 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tomccap.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3625100",
  abstract =     "With the continuous advancement of deepfake
                 technology, there has been a surge in the creation of
                 realistic fake videos. Unfortunately, the malicious
                 utilization of deepfake poses a significant threat to
                 societal morality and political security. Therefore,
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Multimed Comput. Commun. Appl.",
  articleno =    "137",
  fjournal =     "ACM Transactions on Multimedia Computing,
                 Communications, and Applications",
  journal-URL =  "https://dl.acm.org/loi/tomm",
}

@article{Wang:2024:KIM,
  author          = {Depei Wang and Ruifeng Xu and Lianglun Cheng and
                     Zhuowei Wang},
  title           = {Knowledge-integrated Multi-modal Movie Turning Point
                     Identification},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {138:1--138:??},
  articleno       = {138},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3638557},
  url             = {https://dl.acm.org/doi/10.1145/3638557},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {The rapid development of artificial intelligence
                     provides rich technologies and tools for the automated
                     understanding of literary works. As a comprehensive
                     carrier of storylines, movies are natural multimodal
                     data sources that provide sufficient data \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@article{Liu:2024:DCF,
  author          = {Chunpu Liu and Guanglei Yang and Wangmeng Zuo and
                     Tianyi Zang},
  title           = {{DPDFormer}: a Coarse-to-Fine Model for Monocular
                     Depth Estimation},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {139:1--139:??},
  articleno       = {139},
  month           = may,
  year            = {2024},
  doi             = {https://doi.org/10.1145/3638559},
  url             = {https://dl.acm.org/doi/10.1145/3638559},
  coden           = {????},
  issn            = {1551-6857 (print), 1551-6865 (electronic)},
  issn-l          = {1551-6857},
  abstract        = {Monocular depth estimation attracts great attention
                     from computer vision researchers for its convenience in
                     acquiring environment depth information. Recently
                     classification-based MDE methods show its promising
                     performance and begin to act as an essential \ldots{}},
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-url     = {https://dl.acm.org/loi/tomm},
  acknowledgement = ack-nhfb,
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
}

@Article{Yan:2024:TSP,
  author          = {Yunyao Yan and Guoqing Xiang and Huizhu Jia and Jie
                     Chen and Xiaofeng Huang and Xiaodong Xie},
  title           = {Two-Stage Perceptual Quality Oriented Rate Control
                     Algorithm for {HEVC}},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {140:1--140:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3636510},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3636510},
  abstract        = {As a practical technique in mainstream video coding
                     applications, rate control dominates important to
                     ensure compression quality with limited bitrates
                     constraints. However, most rate control methods mainly
                     focus on objective quality while ignoring the
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {140},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Li:2024:VDG,
  author          = {Zongyi Li and Yuxuan Shi and Hefei Ling and Jiazhong
                     Chen and Boyuan Liu and Runsheng Wang and Chengxin
                     Zhao},
  title           = {Viewpoint Disentangling and Generation for
                     Unsupervised Object {Re-ID}},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {141:1--141:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3632959},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3632959},
  abstract        = {Unsupervised object Re-ID aims to learn discriminative
                     identity features from a fully unlabeled dataset to
                     solve the open-class re-identification problem.
                     Satisfying results have been achieved in existing
                     unsupervised Re-ID methods, primarily trained with
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {141},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Dai:2024:TLF,
  author          = {Kuai Dai and Xutao Li and Huiwei Lin and Yin Jiang and
                     Xunlai Chen and Yunming Ye and Di Xian},
  title           = {{TinyPredNet}: a Lightweight Framework for Satellite
                     Image Sequence Prediction},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {142:1--142:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3638773},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3638773},
  abstract        = {Satellite image sequence prediction aims to precisely
                     infer future satellite image frames with historical
                     observations, which is a significant and challenging
                     dense prediction task. Though existing deep learning
                     models deliver promising performance for \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {142},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Ma:2024:RRA,
  author          = {Yingnan Ma and Chenqiu Zhao and Bingran Huang and
                     Xudong Li and Anup Basu},
  title           = {{RAST}: Restorable Arbitrary Style Transfer},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {143:1--143:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3638770},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3638770},
  abstract        = {The objective of arbitrary style transfer is to apply
                     a given artistic or photo-realistic style to a target
                     image. Although current methods have shown some success
                     in transferring style, arbitrary style transfer still
                     has several issues, including content \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {143},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Hsu:2024:CDA,
  author          = {Wei-Yen Hsu and Hsien-Wen Lin},
  title           = {Context-detail-aware United Network for Single Image
                     Deraining},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {144:1--144:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3639407},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3639407},
  abstract        = {Images captured outdoors are often affected by rainy
                     days, resulting in a severe deterioration in the visual
                     quality of the captured images and a decrease in the
                     performance of related applications. Therefore, single
                     image deraining has attracted \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {144},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2024:TSM,
  author          = {Yao Liu and Gangfeng Cui and Jiahui Luo and Xiaojun
                     Chang and Lina Yao},
  title           = {Two-stream Multi-level Dynamic Point Transformer for
                     Two-person Interaction Recognition},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {145:1--145:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3639470},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3639470},
  abstract        = {As a fundamental aspect of human life, two-person
                     interactions contain meaningful information about
                     people's activities, relationships, and social
                     settings. Human action recognition serves as the
                     foundation for many smart applications, with a strong
                     focus \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {145},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Chen:2024:MCT,
  author          = {Chengxin Chen and Pengyuan Zhang},
  title           = {Modality-collaborative Transformer with Hybrid Feature
                     Reconstruction for Robust Emotion Recognition},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {146:1--146:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3640343},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3640343},
  abstract        = {As a vital aspect of affective computing, Multimodal
                     Emotion Recognition has been an active research area in
                     the multimedia community. Despite recent progress, this
                     field still confronts two major challenges in
                     real-world applications: (1) improving the \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {146},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Huang:2024:UOI,
  author          = {Jiafeng Huang and Tianjun Zhang and Shengjie Zhao and
                     Lin Zhang and Yicong Zhou},
  title           = {An Underwater Organism Image Dataset and a Lightweight
                     Module Designed for Object Detection Networks},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {147:1--147:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3640465},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3640465},
  abstract        = {Long-term monitoring and recognition of underwater
                     organism objects are of great significance in marine
                     ecology, fisheries science and many other disciplines.
                     Traditional techniques in this field, including manual
                     fishing-based ones and sonar-based ones, \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {147},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2024:PPM,
  author          = {Jing Liu and Litao Shang and Yuting Su and Weizhi Nie
                     and Xin Wen and Anan Liu},
  title           = {Privacy-preserving Multi-source Cross-domain
                     Recommendation Based on Knowledge Graph},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {148:1--148:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3639706},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3639706},
  abstract        = {The cross-domain recommender systems aim to alleviate
                     the data sparsity problem in the target domain by
                     transferring knowledge from the auxiliary domain.
                     However, existing works ignore the fact that the data
                     sparsity problem may also exist in the single
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {148},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Liu:2024:BDB,
  author          = {Xingyu Liu and Zhongyun Hua and Shuang Yi and Yushu
                     Zhang and Yicong Zhou},
  title           = {Bi-directional Block Encoding for Reversible Data
                     Hiding over Encrypted Images},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {149:1--149:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3638771},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3638771},
  abstract        = {Reversible data hiding over encrypted images (RDH-EI)
                     technology is a viable solution for privacy-preserving
                     cloud storage, as it enables the reversible embedding
                     of additional data into images while maintaining image
                     confidentiality. Since the data \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {149},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

@Article{Yi:2024:OVS,
  author          = {Peng Yi and Zhongyuan Wang and Laigan Luo and Kui
                     Jiang and Zheng He and Junjun Jiang and Tao Lu and
                     Jiayi Ma},
  title           = {Omniscient Video Super-Resolution with
                     Explicit-Implicit Alignment},
  journal         = j-TOMM,
  volume          = {20},
  number          = {5},
  pages           = {150:1--150:??},
  month           = may,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3640346},
  ISSN            = {1551-6857 (print), 1551-6865 (electronic)},
  ISSN-L          = {1551-6857},
  bibdate         = {Wed Apr 10 08:42:41 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/tomccap.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3640346},
  abstract        = {When considering the temporal relationships, most
                     previous video super-resolution (VSR) methods follow
                     the iterative or recurrent framework. The iterative
                     framework adopts neighboring low-resolution (LR) frames
                     from a sliding window, while the recurrent \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Multimed Comput. Commun. Appl.},
  articleno       = {150},
  fjournal        = {ACM Transactions on Multimedia Computing,
                     Communications, and Applications},
  journal-URL     = {https://dl.acm.org/loi/tomm},
}

%%% [17-Apr-2021] TO DO: where are articles 104 to 116 from volume 16?
%%% There is a gap between issues 3s and 4.  I reported the problem
%%% to ACM on 24 March 2022.  The problem persists as of [21-Dec-2023].